# Course: 5210 Data Visualization
# Purpose: Technical Appendix for Midterm Project
# Date: July 29, 2021
# Author: Jennifer Grosz and Josh Wolfe
# Clear environment of variables and functions
rm(list = ls(all = TRUE)) 

# Clear environment of packages
if(is.null(sessionInfo()$otherPkgs) == FALSE)lapply(paste("package:", names(sessionInfo()$otherPkgs), sep=""), detach, character.only = TRUE, unload = TRUE)
# Load relevant packages used for analyzing General Mills data
library(tidyverse) # contains ggplot2, dplyr, and several other packages used
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.4     ✓ purrr   0.3.4
## ✓ tibble  3.1.2     ✓ dplyr   1.0.7
## ✓ tidyr   1.1.3     ✓ stringr 1.4.0
## ✓ readr   1.4.0     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(vtable) # contains vtable function for summary table of data
## Loading required package: kableExtra
## 
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
## 
##     group_rows
library(janitor) # contains tidyverse functions for cross-tables
## 
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
library(gridExtra) # contains grid.arrange function used to combine plots in the same window
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
library(patchwork) # use to put graphs together in the same frame
library(knitr) # contains some table formatting functions
library(kableExtra) # also contains functions used for table outputs
library(GGally) # contains a custom correlation plot 
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
library(moments) # to calculate skewness
library(scales) # use for rounding numbers
## 
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
## 
##     discard
## The following object is masked from 'package:readr':
## 
##     col_factor
library(corrplot) # for correlation plot
## corrplot 0.90 loaded
library(tidylog) # provides data set information reports
## 
## Attaching package: 'tidylog'
## The following objects are masked from 'package:dplyr':
## 
##     add_count, add_tally, anti_join, count, distinct, distinct_all,
##     distinct_at, distinct_if, filter, filter_all, filter_at, filter_if,
##     full_join, group_by, group_by_all, group_by_at, group_by_if,
##     inner_join, left_join, mutate, mutate_all, mutate_at, mutate_if,
##     relocate, rename, rename_all, rename_at, rename_if, rename_with,
##     right_join, sample_frac, sample_n, select, select_all, select_at,
##     select_if, semi_join, slice, slice_head, slice_max, slice_min,
##     slice_sample, slice_tail, summarise, summarise_all, summarise_at,
##     summarise_if, summarize, summarize_all, summarize_at, summarize_if,
##     tally, top_frac, top_n, transmute, transmute_all, transmute_at,
##     transmute_if, ungroup
## The following objects are masked from 'package:tidyr':
## 
##     drop_na, fill, gather, pivot_longer, pivot_wider, replace_na,
##     spread, uncount
## The following object is masked from 'package:stats':
## 
##     filter
library(tidyr) # provides uncount function
library(Hmisc)
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## 
## Attaching package: 'Hmisc'
## The following object is masked from 'package:tidylog':
## 
##     summarize
## The following objects are masked from 'package:dplyr':
## 
##     src, summarize
## The following objects are masked from 'package:base':
## 
##     format.pval, units
library(MultinomialCI)
library(htmlTable)
library(colorspace)
options(scipen = 999) # remove scientific notation
# load product data
product_data <- read.csv("../data/mtp_product_data.csv")

# load sales data
sales_data <- read.csv("../data/mtp_sales_data.csv")
# Need to mutate UPC values to merge data sets because they're different in each data set. For example:
# 
# - sales_data has UPC as 01.16000.11653
# - product_data has UPC as 00-01-16000-11653

# change UPC to merge data sets
product_data <- product_data %>%
  # start by separating values by -."
  separate("UPC", into = c("first", "second", "third", "fourth"), sep = "-") %>%
  # unite columns to make UPC the same as it is in sales_data
  unite(col = "UPC", c("second", "third", "fourth"), sep = ".") %>%
  # drop column with leading zeros from product_data's UPC
  select(-first)
## select: dropped one variable (first)
# Join data sets
gm_joined_data <- right_join(sales_data, product_data, by = "UPC")
## right_join: added 4 columns (brand, flavor, volume, package)
##             > rows only in x  (     0)
##             > rows only in y        0
##             > matched rows     21,850    (includes duplicates)
##             >                 ========
##             > rows total       21,850
# Create cereal variable
# Reduce brand variable to just brand name

gm_joined_data <- gm_joined_data %>%
  mutate(
    cereal = case_when(
      str_sub(brand, 1,  7) == "GENERAL" ~ str_sub(brand, 15, -1),
      str_sub(brand, 1, 8) == "KELLOGGS" ~ str_sub(brand, 10, -1),
      str_sub(brand, 1, 4) == "POST" ~ str_sub(brand, 6, -1)),
    brand = case_when(
    str_sub(brand, 1,  7) == "GENERAL" ~ "GENERAL MILLS",
    str_sub(brand, 1, 8) == "KELLOGGS" ~ "KELLOGGS",
    str_sub(brand, 1, 4) == "POST" ~ "POST")
  )
## mutate: changed 21,850 values (100%) of 'brand' (0 new NA)
##         new variable 'cereal' (character) with 15 unique values and 0% NA
gm_only <- subset(gm_joined_data, brand == "GENERAL MILLS")
# Convert variables to factors
gm_joined_data[,'promo'] <- factor(gm_joined_data[,'promo'])
gm_joined_data[,'ad'] <- factor(gm_joined_data[,'ad'])
gm_joined_data[,'brand'] <- factor(gm_joined_data[,'brand'])
gm_joined_data[,'cereal'] <- factor(gm_joined_data[,'cereal'])
gm_joined_data[,'flavor'] <- factor(gm_joined_data[,'flavor'])
gm_joined_data[,'package'] <- factor(gm_joined_data[,'package'])
gm_joined_data[,'iri_key'] <- factor(gm_joined_data[,'iri_key'])

gm_only[,'promo'] <- factor(gm_only[,'promo'])
gm_only[,'ad'] <- factor(gm_only[,'ad'])
gm_only[,'brand'] <- factor(gm_only[,'brand'])
gm_only[,'cereal'] <- factor(gm_only[,'cereal'])
gm_only[,'flavor'] <- factor(gm_only[,'flavor'])
gm_only[,'package'] <- factor(gm_only[,'package'])
gm_only[,'iri_key'] <- factor(gm_only[,'iri_key'])
# Create total sales data frame
gm_total_sales <- uncount(gm_joined_data, units)
## uncount: now 187,450 rows and 11 columns, ungrouped
gm_only <- uncount(gm_only, units)
## uncount: now 69,017 rows and 11 columns, ungrouped

1 Base EDA Step 1: Uni-variate non-graphical EDA

  • Examine the data
# Look at the top few rows of the data
head(gm_joined_data)
##              UPC iri_key week units price promo   ad         brand
## 1 01.16000.11653  644347    6     5   0.5     0    A GENERAL MILLS
## 2 01.16000.11653  248741    5     2   0.5     0 NONE GENERAL MILLS
## 3 01.16000.11653  535806   11     3   0.5     0 NONE GENERAL MILLS
## 4 01.16000.11945  675634   11     2   0.5     0 NONE GENERAL MILLS
## 5 01.16000.11945  205272   13     8   0.5     0 NONE GENERAL MILLS
## 6 01.16000.11945  248741   14     5   0.5     0 NONE GENERAL MILLS
##           flavor volume package          cereal
## 1 CINNAMON TOAST   0.06     BOX CINNAMON TST CR
## 2 CINNAMON TOAST   0.06     BOX CINNAMON TST CR
## 3 CINNAMON TOAST   0.06     BOX CINNAMON TST CR
## 4        TOASTED   0.04     BOX        CHEERIOS
## 5        TOASTED   0.04     BOX        CHEERIOS
## 6        TOASTED   0.04     BOX        CHEERIOS
  • Data appears to be tidy

    • Each column is of the same variable type and has a unique entry for each row
    • There are no missing or duplicate values
# Get a breakdown of the variables
str(gm_joined_data)
## 'data.frame':    21850 obs. of  12 variables:
##  $ UPC    : chr  "01.16000.11653" "01.16000.11653" "01.16000.11653" "01.16000.11945" ...
##  $ iri_key: Factor w/ 1420 levels "200171","200197",..: 1041 446 1018 1217 48 446 1295 794 1184 1043 ...
##  $ week   : int  6 5 11 11 13 14 39 35 45 5 ...
##  $ units  : int  5 2 3 2 8 5 6 1 4 14 ...
##  $ price  : num  0.5 0.5 0.5 0.5 0.5 0.5 1.09 1.59 1.59 1 ...
##  $ promo  : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ ad     : Factor w/ 3 levels "A","B","NONE": 1 3 3 3 3 3 3 3 3 3 ...
##  $ brand  : Factor w/ 3 levels "GENERAL MILLS",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ flavor : Factor w/ 5 levels "CINNAMON TOAST",..: 1 1 1 5 5 5 1 1 1 1 ...
##  $ volume : num  0.06 0.06 0.06 0.04 0.04 0.04 0.12 0.12 0.12 0.12 ...
##  $ package: Factor w/ 2 levels "BOX","CUP": 1 1 1 1 1 1 2 2 2 2 ...
##  $ cereal : Factor w/ 15 levels "CHEERIOS","CINNAMON TST CR",..: 2 2 2 1 1 1 2 2 2 2 ...
# Get a breakdown of the variables
vtable(gm_joined_data)
gm_joined_data
Name Class Values
UPC character
iri_key factor ‘200171’ ‘200197’ ‘200272’ ‘200297’ ‘200341’ and more
week integer Num: 1 to 52
units integer Num: 1 to 28
price numeric Num: 0.25 to 9.99
promo factor ‘0’ ‘1’
ad factor ‘A’ ‘B’ ‘NONE’
brand factor ‘GENERAL MILLS’ ‘KELLOGGS’ ‘POST’
flavor factor ‘CINNAMON TOAST’ ‘COCOA’ ‘FRUIT’ ‘REGULAR’ ‘TOASTED’
volume numeric Num: 0.04 to 4
package factor ‘BOX’ ‘CUP’
cereal factor ‘CHEERIOS’ ‘CINNAMON TST CR’ ‘COCOA KRISPIES’ ‘COCOA PUFFS’ ‘FROOT LOOPS’ and more
  • Two Numeric variables

  • Two integer variables

  • Seven factor variables

    • Promo is a binary variable where 0 is no promotion and 1 is promotion
    • Brand, cereal, and flavor are categories
    • Package is the ceral packaging type, box or cup
    • Ad is either NONE for no ad, A for small ad, or B for medium ad
# Get variable descriptive statistics
summary(gm_joined_data)
##      UPC               iri_key           week           units       
##  Length:21850       656444 :   35   Min.   : 1.00   Min.   : 1.000  
##  Class :character   256951 :   31   1st Qu.:14.00   1st Qu.: 3.000  
##  Mode  :character   259661 :   31   Median :27.00   Median : 7.000  
##                     267403 :   31   Mean   :26.62   Mean   : 8.579  
##                     652139 :   31   3rd Qu.:40.00   3rd Qu.:12.000  
##                     681735 :   31   Max.   :52.00   Max.   :28.000  
##                     (Other):21660                                   
##      price       promo        ad                  brand      
##  Min.   :0.250   0:17305   A   : 1456   GENERAL MILLS: 7189  
##  1st Qu.:3.190   1: 4545   B   : 1061   KELLOGGS     :12183  
##  Median :3.790             NONE:19333   POST         : 2478  
##  Mean   :3.763                                               
##  3rd Qu.:4.390                                               
##  Max.   :9.990                                               
##                                                              
##             flavor         volume      package                     cereal     
##  CINNAMON TOAST:1834   Min.   :0.040   BOX:21306   FROSTED FLAKES     : 2295  
##  COCOA         :1901   1st Qu.:0.750   CUP:  544   FROOT LOOPS        : 2192  
##  FRUIT         :2192   Median :1.060               CINNAMON TST CR    : 1834  
##  REGULAR       :8816   Mean   :1.016               LUCKY CHARMS       : 1681  
##  TOASTED       :7107   3rd Qu.:1.120               FROSTED MINI WHEATS: 1574  
##                        Max.   :4.000               CHEERIOS           : 1458  
##                                                    (Other)            :10816
  • Comments and questions about the data

    • Average units sold is positively skewed
    • Price per package is relatively uniform
    • Far more boxes are sold compared to cups
    • Regular flavor is the highest seller closely followed by Toasted
    • Kelloggs sells the most units
    • Post sells the fewest units by a significant margin

1.1 Categorical Variables

  • The categorical variables in this data set are iri_key (store key), week, brand, flavor, package, promo, and ad

  • To better understand these variables will look at two elements:

    • absolute count

      • shows the number of observations or occurrences of each type of data
    • relative proportion or percent

      • shows a count of each factor level divided by the total count of a variable

1.1.1 iri_key

gm_joined_data %>%
  group_by(iri_key) %>%
  summarise(count = n(),  # make calculations to summarize variable and create counting function
            total_units_sold = sum(units), # calculate total number of units sold
            proportion = sum(count) / nrow(gm_joined_data), # Calculate proportion
            percent = sum(count) / nrow(gm_joined_data)) %>%# create percent variable
  arrange(-percent) %>% # display table results by percent in descending order
  mutate(percent = percent(percent))
## # A tibble: 1,420 x 5
##    iri_key count total_units_sold proportion percent
##    <fct>   <int>            <int>      <dbl> <chr>  
##  1 656444     35              170    0.00160 0.1602%
##  2 256951     31              308    0.00142 0.1419%
##  3 259661     31              173    0.00142 0.1419%
##  4 267403     31              260    0.00142 0.1419%
##  5 652139     31              167    0.00142 0.1419%
##  6 681735     31              235    0.00142 0.1419%
##  7 1094689    31              249    0.00142 0.1419%
##  8 651600     30              210    0.00137 0.1373%
##  9 240692     29              210    0.00133 0.1327%
## 10 247300     29              268    0.00133 0.1327%
## # … with 1,410 more rows
# there are 1420 separate stores in this data set
gm_joined_data %>%
  group_by(iri_key) %>%
  summarise(count = n(),  # make calculations to summarize variable and create counting function
            total_units_sold = sum(units), # calculate total number of units sold
            proportion = sum(count) / nrow(gm_joined_data), # Calculate proportion
            percent = sum(count) / nrow(gm_joined_data)) %>%# create percent variable 
  arrange(-percent) %>% # display table results by percent in descending order
  mutate(percent = percent(percent)) 
## group_by: one grouping variable (iri_key)
## summarise: now 1,420 rows and 5 columns, ungrouped
## mutate: converted 'percent' from double to character (0 new NA)
## # A tibble: 1,420 x 5
##    iri_key count total_units_sold proportion percent
##    <fct>   <int>            <int>      <dbl> <chr>  
##  1 656444     35              170    0.00160 0.1602%
##  2 256951     31              308    0.00142 0.1419%
##  3 259661     31              173    0.00142 0.1419%
##  4 267403     31              260    0.00142 0.1419%
##  5 652139     31              167    0.00142 0.1419%
##  6 681735     31              235    0.00142 0.1419%
##  7 1094689    31              249    0.00142 0.1419%
##  8 651600     30              210    0.00137 0.1373%
##  9 240692     29              210    0.00133 0.1327%
## 10 247300     29              268    0.00133 0.1327%
## # … with 1,410 more rows
# there are 1420 separate stores in this data set

Comments:

- distribution of observations per store looks fairly even
- No single store makes up a significant proportion or percent of this data set, highest count is 35 followed by a group of stores with 31 observations
- There are a number of stores with less than 10 observations, so we might have too small of a sample size for those stores 

1.1.2 week

week <- gm_joined_data %>%
  group_by(week) %>%
  summarise(count = n(),  # make calculations to summarize variable and create counting function
            proportion = sum(count) / nrow(gm_joined_data), # Calculate proportion
            percent = sum(count) / nrow(gm_joined_data)) %>%# create percent variable 
  arrange(-percent) %>% # display table results by percent in descending order
  mutate(percent = percent(percent)) 

# there are 1420 separate stores in this data set
week count proportion percent
22 475 0.0217391 2.1739%
12 473 0.0216476 2.1648%
37 464 0.0212357 2.1236%
29 449 0.0205492 2.0549%
20 445 0.0203661 2.0366%
43 445 0.0203661 2.0366%
48 445 0.0203661 2.0366%
38 441 0.0201831 2.0183%
40 441 0.0201831 2.0183%
44 441 0.0201831 2.0183%
30 439 0.0200915 2.0092%
32 439 0.0200915 2.0092%
45 439 0.0200915 2.0092%
36 436 0.0199542 1.9954%
7 435 0.0199085 1.9908%
18 435 0.0199085 1.9908%
27 432 0.0197712 1.9771%
47 431 0.0197254 1.9725%
14 429 0.0196339 1.9634%
49 429 0.0196339 1.9634%
6 428 0.0195881 1.9588%
5 427 0.0195423 1.9542%
24 425 0.0194508 1.9451%
28 424 0.0194050 1.9405%
25 423 0.0193593 1.9359%
13 421 0.0192677 1.9268%
19 421 0.0192677 1.9268%
26 420 0.0192220 1.9222%
46 420 0.0192220 1.9222%
42 418 0.0191304 1.9130%
17 414 0.0189474 1.8947%
39 414 0.0189474 1.8947%
11 412 0.0188558 1.8856%
41 412 0.0188558 1.8856%
4 409 0.0187185 1.8719%
31 409 0.0187185 1.8719%
16 403 0.0184439 1.8444%
3 402 0.0183982 1.8398%
2 401 0.0183524 1.8352%
51 400 0.0183066 1.8307%
23 399 0.0182609 1.8261%
35 399 0.0182609 1.8261%
9 398 0.0182151 1.8215%
15 397 0.0181693 1.8169%
34 394 0.0180320 1.8032%
33 393 0.0179863 1.7986%
10 389 0.0178032 1.7803%
21 389 0.0178032 1.7803%
52 388 0.0177574 1.7757%
8 386 0.0176659 1.7666%
1 383 0.0175286 1.7529%
50 369 0.0168879 1.6888%

Comments:

- distribution of observations per week looks fairly even
- No single week makes up a significant proportion or percent of this data set, highest count is 475 and the lowest count is 369
- might want to convert weeks into month/date values so we could look for seasonal or quarterly trends 

1.1.3 promo

promo <- gm_joined_data %>%
  group_by(promo) %>%
  summarise(count = n(),  # make calculations to summarize variable and create counting function
            proportion = round(sum(count) / nrow(gm_joined_data), 2),# Calculate proportion
            percent = sum(count) / nrow(gm_joined_data)) %>%# create percent variable 
  arrange(-percent) %>% # display table results by percent in descending order
  mutate(percent = percent(percent)) # convert to percentage
promo count proportion percent
0 17305 0.79 79%
1 4545 0.21 21%

Comments:

- significantly more observations in this data set were recorded when there was not an in store promotion than when there was a promotion
- 79% of the observations are no in-store promotions sales
- 21% of the observations are in-store promotions sales

1.1.5 brand

brand <- gm_joined_data %>%
  group_by(brand) %>%
  summarise(count = n(),  # make calculations to summarize variable and create counting function
            proportion = round(sum(count) / nrow(gm_joined_data), 2),# Calculate proportion
            percent = sum(count) / nrow(gm_joined_data)) %>%# create percent variable 
  arrange(-percent) %>% # display table results by percent in descending order
  mutate(percent = percent(percent)) # convert to percentage
brand count proportion percent
KELLOGGS 12183 0.56 56%
GENERAL MILLS 7189 0.33 33%
POST 2478 0.11 11%

Comments:

- Confirms Kelloggs had the most sales and Post had the least sales
- 56% of the observations are Kelloggs brand sales
- 33% of the observations are General Mills brand sales
- 11% of the observations are Post brand sales

1.1.6 flavor

flavor <- gm_joined_data %>%
  group_by(flavor) %>%
  summarise(count = n(),  # make calculations to summarize variable and create counting function
            proportion = round(sum(count) / nrow(gm_joined_data), 2),# Calculate proportion
            percent = sum(count) / nrow(gm_joined_data)) %>%# create percent variable 
  arrange(-percent) %>% # display table results by percent in descending order
  mutate(percent = percent(percent)) # convert to percentage
flavor count proportion percent
REGULAR 8816 0.40 40.35%
TOASTED 7107 0.33 32.53%
FRUIT 2192 0.10 10.03%
COCOA 1901 0.09 8.70%
CINNAMON TOAST 1834 0.08 8.39%

Comments:

- Confirms Regular flavor is the highest seller closely followed by Toasted
- 40.35% of the observations are Regular flavored sales
- 32.53% of the observations are Toasted flavored sales
- 10.03% of the observations are Fruit flavored sales
- 8.70% of the observations are Cocoa flavored sales
- 8.39% of the observations are Cinnamon Toast flavored sales

1.1.7 package

package <- gm_joined_data %>%
  group_by(package) %>%
  summarise(count = n(),  # make calculations to summarize variable and create counting function
            proportion = round(sum(count) / nrow(gm_joined_data), 2),# Calculate proportion
            percent = sum(count) / nrow(gm_joined_data)) %>%# create percent variable 
  arrange(-percent) %>% # display table results by percent in descending order
  mutate(percent = percent(percent)) # convert to percentage
package count proportion percent
BOX 21306 0.98 98%
CUP 544 0.02 2%

Comments:

- Confirms significantly more observations in this data set were packaged in a box
- 98% of the observations are packaged in a box
- 2% of the observations are packaged in a cup

1.1.8 cereal

cereal <- gm_joined_data %>%
  group_by(cereal) %>%
  summarise(count = n(),  # make calculations to summarize variable and create counting function
            proportion = round(sum(count) / nrow(gm_joined_data), 2),# Calculate proportion
            percent = sum(count) / nrow(gm_joined_data)) %>%# create percent variable 
  arrange(-percent) %>% # display table results by percent in descending order
  mutate(percent = percent(percent)) # convert to percentage
cereal count proportion percent
FROSTED FLAKES 2295 0.11 10.503%
FROOT LOOPS 2192 0.10 10.032%
CINNAMON TST CR 1834 0.08 8.394%
LUCKY CHARMS 1681 0.08 7.693%
FROSTED MINI WHEATS 1574 0.07 7.204%
CHEERIOS 1458 0.07 6.673%
RICE KRISPIES 1450 0.07 6.636%
SPECIAL K 1391 0.06 6.366%
GRAPE NUTS 1289 0.06 5.899%
RAISIN BRAN 1266 0.06 5.794%
KIX 1196 0.05 5.474%
SHREDDED WHEAT 1189 0.05 5.442%
SMART START 1134 0.05 5.190%
COCOA PUFFS 1020 0.05 4.668%
COCOA KRISPIES 881 0.04 4.032%

Comments:

- distribution seems fairly balanced over cereal types
- there may be a low number of Cocoa Krispies when split up

1.2 Quantitative Variables

  • The Quantitative variables in this data set are units, price, and volume

  • To better understand the Quantitative variables we will look at the following elements:

    • central tendency

      • shows the average of a quantity
    • variation

      • measures the change of difference in quantity
      • range
      • standard deviation
    • skewness

      • measures the symmetry of the distribution of a variable

1.2.1 units

units <- gm_joined_data %>% 
summarise(mean = mean(units), # calculate mean
          median = median(units), # calculate median
          max = max(units), # calculate max
          min = min(units), # calculate min
          standard_deviation = sd(units), # calculate sd
          skew = skewness(units)) # calculate skew
mean median max min standard_deviation skew
8.578947 7 28 1 6.70199 0.9981163

Comments:

  • skewness of units is 1

    • Confirms positive skew
  • mean (8.5789474) is greater than median (7)

  • median will be a better measure of central tendency.

  • range (max - min) of price = 28 - 1 = 27

  • standard deviation of price = 6.7

1.2.2 price

price <- gm_joined_data %>% 
summarise(mean = mean(price), # calculate mean
          median = median(price), # calculate median
          max = max(price), # calculate max
          min = min(price), # calculate min
          standard_deviation = sd(price), # calculate sd
          skew = skewness(price)) # calculate skew
mean median max min standard_deviation skew
3.763466 3.79 9.99 0.25 0.9971157 -0.22296

Comments:

  • skewness of price is -0.22, small skew

  • mean ($3.76) is less than median ($3.79)

  • range (max - min) of price = $9.99 - $0.25 = $9.74

  • standard deviation of price = $1.00

1.2.3 volume

volume <- gm_joined_data %>% 
summarise(mean = mean(volume), # calculate mean
          median = median(volume), # calculate median
          max = max(volume), # calculate max
          min = min(volume), # calculate min
          standard_deviation = sd(volume), # calculate sd
          skew = skewness(volume)) # calculate skew
mean median max min standard_deviation skew
1.01561 1.06 4 0.04 0.3703431 0.540629

Comments:

- skewness of volume is 0.54, small skew

- mean (1.02) is less than median (1.06) 
- median will be a better measure of central tendency.

- range (max - min) of price = 4 - 0.04 = 3.96

- standard deviation of price = 0.37

1.3 Questions/Comments

  • Store (iri_key) - distribution of observations per store looks fairly even

  • Week - distribution of observations appears to be fairly even across all weeks

    • we may want to convert weeks into month/date values to evaluate seasonal or quarterly trends
  • Promo - significantly more observations were recorded when there was not an in store promotion than when there was a promotion

  • Ad - significantly more observations were recorded when there was not an advertisements

  • Brand - Kelloggs had the most sales and Post had the least sales

  • Flavor - Regular flavor is the highest seller closely followed by Toasted

  • Cereal - distribution seems to be fairly balanced over cereal type, but there may be a low number of Cocoa Krispies when split up

  • Package - significantly more observations in this data set were packaged in a box

  • Units - positive skew

  • Price - slight negative skew

  • Volume - slight positive skew

2 Base EDA Step 2: Uni-variate graphical EDA

2.1 Categorical Variables

  • Uni-variate graphical EDA for categorical variables will be bar graphs showing the count of observations per category

2.1.1 iri_key

# set up graph showing count for categories
ggplot(data = gm_joined_data, mapping = aes(x = iri_key)) +
 geom_bar() 

Comments:

-  this is not a very clear visual, might want to identify specific stores for further analysis

2.1.2 week

# set up graph showing count for categories
ggplot(data = gm_joined_data, mapping = aes(x = factor(week))) +
 geom_bar() 

Comments:

- confirms even distribution of observations over each week

2.1.3 promo

# set up graph showing count for categories
ggplot(data = gm_joined_data, mapping = aes(x = factor(promo))) +
 geom_bar() 

Comments:

- confirms more sales were made without an in store promotion
- is this representative of the typical distribution?

2.1.4 ad

# set up graph showing count for categories
ggplot(data = gm_joined_data, mapping = aes(x = factor(ad))) +
 geom_bar() 

Comments:

- confirms the most sales were made without an advertisement
- is this representative of the typical distribution?

2.1.5 brand

# set up graph showing count for categories
ggplot(data = gm_joined_data, mapping = aes(x = factor(brand))) +
 geom_bar() 

Comments:

- confirms Kelloggs had the most sales and Post had the least sales

2.1.6 flavor

# set up graph showing count for categories
ggplot(data = gm_joined_data, mapping = aes(x = factor(flavor))) +
 geom_bar() 

Comments:

- confirms Regular and Toasted had the most sales 
- Cinnamon Toast had the least sales

2.1.7 package

# set up graph showing count for categories
ggplot(data = gm_joined_data, mapping = aes(x = factor(package))) +
 geom_bar() 

Comments:

- confirms box Packaging had the most sales 
- is this representative of a typical distribution?

2.1.8 cereal

# set up graph showing count for categories
ggplot(data = gm_joined_data, mapping = aes(x = factor(cereal))) +
  geom_bar() 

Comments:

- there appears to be a good number observations for each volume category

2.2 Quantitative Variables

- Uni-variate graphical EDA for quantitative variables will be histogram plots, box plots and density plots

2.2.1 units

# Create histogram
units_histogram <- ggplot(data = gm_joined_data, mapping = aes(x = units)) + 
  geom_histogram(bins = 10)

# Create box plot
units_boxplot <- ggplot(data = gm_joined_data, mapping = aes(x = 1)) + 
  geom_boxplot(mapping = aes(y = units)) 

# create density plot
units_density <- ggplot(data = gm_joined_data, aes(x = units)) +
  geom_line(stat = "density", color = "red") 

# output
units_boxplot + (units_histogram / units_density) 

- Confirms skewed distribution

- will use median rather than mean

- observations with more than 30 units appear to be outliers, but they should be kept in the data set

2.2.2 price

# Create histogram
price_histogram <- ggplot(data = gm_joined_data, mapping = aes(x = price)) + 
  geom_histogram(bins = 10)

# Create boxplot
price_boxplot <- ggplot(data = gm_joined_data, mapping = aes(x = 1)) + 
  geom_boxplot(mapping = aes(y = price)) 

# create density plot
price_density <- ggplot(data = gm_joined_data, aes(x = price)) +
  geom_line(stat = "density", color = "red") 

# output
price_boxplot + (price_histogram / price_density) 

- Distribution appears to be normal

- will use mean

- price greater than 7.5 might appear to be outliers, but they are important and shouldn't be removed

2.2.3 volume

# Create histogram
volume_histogram <- ggplot(data = gm_joined_data, mapping = aes(x = volume)) + 
  geom_histogram(bins = 10)

# Create boxplot
volume_boxplot <- ggplot(data = gm_joined_data, mapping = aes(x = 1)) + 
  geom_boxplot(mapping = aes(y = volume)) 

# create density plot
volume_density <- ggplot(data = gm_joined_data, aes(x = volume)) +
  geom_line(stat = "density", color = "red") 

# output
volume_boxplot + (volume_histogram / volume_density) 

- Confirms slightly skewed distribution

- will use median rather than mean

- volume of 2 or above appears to be outliers, but they are important and shouldn't be removed

2.3 Questions/Comments

  • Store (iri_key) - not a clear visual, hard to see what’s going on

  • Week - confirms fairly even number of observations across all weeks

  • Promo - is this distribution representative of the expected number of sales made with there’s a promo compared to the number of sales when there isn’t a promo?

  • Ad - is this distribution representative of the expected number of sales made with there’s an advertisement compared to the number of sales when there isn’t an advertisement?

  • Flavor - Cinnamon Toast had the least sales

  • Package - is this distribution representative of the expected number of sales made for box packaging compared to cup packaging?

  • Cereal - there appears to be a good number observations for each volume category. I don’t think there will be an issue with the slightly lower number of Cocoa Krispies observations when split up

  • Units - use median rather than mean

  • Price - use mean, distribution appears normal

  • Volume - use median rather than mean

3 Base EDA Step 3: Multi-Variate Non-Graphical EDA

3.1 Categorical Variables

3.1.1 iri_key

3.1.1.1 iri_key and week

# creates table of counts
gm_joined_data %>%
  tabyl(iri_key, week) %>%
  adorn_totals(where = c("row", "col")) %>%
  head()
##  iri_key 1 10 11 12 13 14 15 16 17 18 19 2 20 21 22 23 24 25 26 27 28 29 3 30
##   200171 0  1  1  0  0  0  0  2  0  1  0 1  1  1  0  0  0  0  0  0  0  0 0  0
##   200197 1  0  1  0  1  0  0  2  1  0  0 1  1  0  0  1  0  1  1  3  0  1 0  0
##   200272 0  0  1  1  0  0  0  0  0  0  0 0  0  0  0  2  0  0  0  0  0  0 1  0
##   200297 0  0  1  0  0  0  0  0  0  0  0 0  0  0  0  2  0  1  2  0  0  0 0  0
##   200341 0  0  0  1  0  1  0  1  0  0  0 1  2  1  0  1  0  1  0  0  1  0 0  0
##   200379 0  0  0  1  2  0  0  0  0  0  0 1  0  0  0  0  0  0  0  1  0  0 1  0
##  31 32 33 34 35 36 37 38 39 4 40 41 42 43 44 45 46 47 48 49 5 50 51 52 6 7 8 9
##   1  0  0  0  1  0  0  0  1 0  0  0  0  0  0  1  0  2  0  0 0  1  0  0 0 0 0 0
##   1  0  0  0  0  1  1  0  0 0  1  0  0  0  2  0  0  1  0  0 2  1  1  0 1 0 0 0
##   0  0  1  0  0  0  1  0  0 0  0  0  0  1  0  0  1  1  0  0 0  1  2  0 2 0 1 1
##   0  1  0  1  0  0  1  0  1 0  1  0  0  0  0  0  1  0  1  0 0  0  0  0 1 0 0 0
##   1  0  0  0  1  0  1  0  0 0  0  0  0  0  1  1  2  0  0  0 1  0  0  0 0 0 0 0
##   0  0  1  1  0  1  0  1  1 1  0  0  2  1  0  1  0  0  0  0 0  0  0  0 2 0 1 0
##  Total
##     15
##     27
##     17
##     14
##     18
##     19

Comments:

- sample size for each group is too small, which means are not going to be able to come up with any reliable findings

3.1.1.2 iri_key and promo

# creates table of counts
gm_joined_data %>%
  tabyl(iri_key, promo) %>%
  adorn_totals(where = c("row", "col")) %>%
  head()
##  iri_key  0 1 Total
##   200171 10 5    15
##   200197 18 9    27
##   200272  9 8    17
##   200297  7 7    14
##   200341  9 9    18
##   200379 17 2    19

Comments:

- sample size for each group looks small, which means we may not be able to come up with any reliable findings about promotions per store

3.1.1.3 iri_key and ad

# creates table of counts
gm_joined_data %>%
  tabyl(iri_key, ad) %>%
  adorn_totals(where = c("row", "col")) %>%
  head()
##  iri_key A B NONE Total
##   200171 4 2    9    15
##   200197 4 0   23    27
##   200272 0 0   17    17
##   200297 1 0   13    14
##   200341 0 1   17    18
##   200379 1 0   18    19

Comments:

- largest sample size for each store when there is no advertisement, we may not be able to come up with any reliable findings about advertisements per store

3.1.1.4 iri_key and brand

# creates table of counts
gm_joined_data %>%
  tabyl(iri_key, brand) %>%
  adorn_totals(where = c("row", "col")) %>%
  head()
##  iri_key GENERAL MILLS KELLOGGS POST Total
##   200171             4       10    1    15
##   200197            10       15    2    27
##   200272             3       10    4    17
##   200297             5        6    3    14
##   200341             6       10    2    18
##   200379            10        7    2    19

Comments:

- sample size for each group looks small, which means we may not be able to come up with any reliable findings about brand sales per store

3.1.1.5 iri_key and flabor

# creates table of counts
gm_joined_data %>%
  tabyl(iri_key, flavor) %>%
  adorn_totals(where = c("row", "col")) %>%
  head()
##  iri_key CINNAMON TOAST COCOA FRUIT REGULAR TOASTED Total
##   200171              2     3     1       3       6    15
##   200197              1     0     4      12      10    27
##   200272              0     0     3       8       6    17
##   200297              4     0     1       3       6    14
##   200341              1     2     1      11       3    18
##   200379              3     1     2       4       9    19

Comments:

- sample size for each group looks small, which means we may not be able to come up with any reliable findings about flavors per store

3.1.1.6 iri_key and package

# creates table of counts
gm_joined_data %>%
  tabyl(iri_key, package) %>%
  adorn_totals(where = c("row", "col")) %>%
  head()
##  iri_key BOX CUP Total
##   200171  15   0    15
##   200197  27   0    27
##   200272  17   0    17
##   200297  14   0    14
##   200341  18   0    18
##   200379  19   0    19

Comments:

- sample size for cup packaging across all stores is small, we won't be able to come up with any reliable findings about cup packaging per store

3.1.1.7 iri_key and cereal

# creates table of counts
gm_joined_data %>%
  tabyl(iri_key, cereal) %>%
  adorn_totals(where = c("row", "col")) %>%
  head()
##  iri_key CHEERIOS CINNAMON TST CR COCOA KRISPIES COCOA PUFFS FROOT LOOPS
##   200171        1               2              2           1           1
##   200197        1               1              0           0           4
##   200272        2               0              0           0           3
##   200297        0               4              0           0           1
##   200341        0               1              1           1           1
##   200379        3               3              0           1           2
##  FROSTED FLAKES FROSTED MINI WHEATS GRAPE NUTS KIX LUCKY CHARMS RAISIN BRAN
##               1                   0          1   0            0           1
##               3                   2          1   4            4           1
##               2                   0          4   1            0           1
##               0                   0          2   0            1           0
##               3                   2          1   4            0           0
##               2                   0          2   0            3           0
##  RICE KRISPIES SHREDDED WHEAT SMART START SPECIAL K Total
##              2              0           1         2    15
##              4              1           0         1    27
##              2              0           2         0    17
##              2              1           3         0    14
##              1              1           1         1    18
##              2              0           1         0    19

Comments:

- sample size for each group looks small, we may not be able to come up with any reliable findings about cereal per store

3.1.2 week

3.1.2.1 week and promo

# creates table of counts
week_promo <- gm_joined_data %>%
  tabyl(week, promo) %>%
  adorn_totals(where = c("row", "col")) 
week 0 1 Total
1 315 68 383
2 320 81 401
3 322 80 402
4 318 91 409
5 346 81 427
6 343 85 428
7 323 112 435
8 286 100 386
9 298 100 398
10 300 89 389
11 345 67 412
12 372 101 473
13 339 82 421
14 326 103 429
15 316 81 397
16 323 80 403
17 329 85 414
18 333 102 435
19 332 89 421
20 355 90 445
21 297 92 389
22 379 96 475
23 312 87 399
24 320 105 425
25 335 88 423
26 320 100 420
27 350 82 432
28 342 82 424
29 364 85 449
30 358 81 439
31 320 89 409
32 336 103 439
33 303 90 393
34 291 103 394
35 303 96 399
36 342 94 436
37 387 77 464
38 365 76 441
39 345 69 414
40 340 101 441
41 343 69 412
42 334 84 418
43 343 102 445
44 353 88 441
45 326 113 439
46 326 94 420
47 344 87 431
48 363 82 445
49 372 57 429
50 295 74 369
51 339 61 400
52 317 71 388
Total 17305 4545 21850

Comments:

- sample size for each group looks good
- every week has more observations when there is no in store promotion 

3.1.2.2 week and ad

# creates table of counts
week_ad <- gm_joined_data %>%
  tabyl(week, ad) %>%
  adorn_totals(where = c("row", "col")) 

small_sample_a <- week_ad %>%
  filter(A < 20)

small_sample_b <- week_ad %>%
  filter(B < 20)
week A B NONE Total
1 25 16 342 383
2 38 15 348 401
3 36 14 352 402
4 27 30 352 409
5 12 17 398 427
6 37 40 351 428
7 62 31 342 435
8 53 16 317 386
9 33 30 335 398
10 48 28 313 389
11 30 18 364 412
12 42 35 396 473
13 36 11 374 421
14 31 17 381 429
15 13 39 345 397
16 35 12 356 403
17 24 39 351 414
18 23 23 389 435
19 28 28 365 421
20 20 19 406 445
21 29 36 324 389
22 28 16 431 475
23 36 13 350 399
24 25 16 384 425
25 39 21 363 423
26 43 24 353 420
27 24 29 379 432
28 24 21 379 424
29 34 23 392 449
30 31 20 388 439
31 28 22 359 409
32 29 25 385 439
33 25 21 347 393
34 21 16 357 394
35 41 14 344 399
36 23 19 394 436
37 31 19 414 464
38 24 12 405 441
39 4 14 396 414
40 25 24 392 441
41 24 14 374 412
42 18 12 388 418
43 15 15 415 445
44 34 18 389 441
45 36 17 386 439
46 18 19 383 420
47 7 20 404 431
48 11 25 409 445
49 16 8 405 429
50 19 17 333 369
51 21 7 372 400
52 20 6 362 388
Total 1456 1061 19333 21850

Comments:

- Medium Advertisements (A) have small sample sizes for the following weeks:
week A B NONE Total
5 12 17 398 427
15 13 39 345 397
39 4 14 396 414
42 18 12 388 418
43 15 15 415 445
46 18 19 383 420
47 7 20 404 431
48 11 25 409 445
49 16 8 405 429
50 19 17 333 369
- Small Advertisements (B) have small sample sizes for the following weeks:
week A B NONE Total
1 25 16 342 383
2 38 15 348 401
3 36 14 352 402
5 12 17 398 427
8 53 16 317 386
11 30 18 364 412
13 36 11 374 421
14 31 17 381 429
16 35 12 356 403
20 20 19 406 445
22 28 16 431 475
23 36 13 350 399
24 25 16 384 425
34 21 16 357 394
35 41 14 344 399
36 23 19 394 436
37 31 19 414 464
38 24 12 405 441
39 4 14 396 414
41 24 14 374 412
42 18 12 388 418
43 15 15 415 445
44 34 18 389 441
45 36 17 386 439
46 18 19 383 420
49 16 8 405 429
50 19 17 333 369
51 21 7 372 400
52 20 6 362 388

3.1.2.3 week and brand

# creates table of counts
week_brand <- gm_joined_data %>%
  tabyl(week, brand) %>%
  adorn_totals(where = c("row", "col")) 
week GENERAL MILLS KELLOGGS POST Total
1 134 219 30 383
2 142 221 38 401
3 141 208 53 402
4 137 218 54 409
5 162 221 44 427
6 145 231 52 428
7 136 250 49 435
8 124 207 55 386
9 152 185 61 398
10 148 197 44 389
11 142 213 57 412
12 171 245 57 473
13 143 237 41 421
14 144 243 42 429
15 137 219 41 397
16 127 227 49 403
17 146 213 55 414
18 159 223 53 435
19 130 243 48 421
20 131 260 54 445
21 130 208 51 389
22 154 260 61 475
23 135 201 63 399
24 160 216 49 425
25 125 246 52 423
26 126 249 45 420
27 144 230 58 432
28 137 233 54 424
29 141 257 51 449
30 145 233 61 439
31 116 243 50 409
32 142 258 39 439
33 120 238 35 393
34 100 248 46 394
35 129 231 39 399
36 127 269 40 436
37 134 283 47 464
38 148 251 42 441
39 124 250 40 414
40 135 254 52 441
41 122 242 48 412
42 151 242 25 418
43 156 242 47 445
44 155 242 44 441
45 139 253 47 439
46 147 220 53 420
47 126 261 44 431
48 150 251 44 445
49 125 256 48 429
50 127 197 45 369
51 124 225 51 400
52 144 214 30 388
Total 7189 12183 2478 21850

Comments:

- sample size for each group looks good

3.1.2.4 week and flavor

# creates table of counts
week_flavor <- gm_joined_data %>%
  tabyl(week, flavor) %>%
  adorn_totals(where = c("row", "col")) 
week CINNAMON TOAST COCOA FRUIT REGULAR TOASTED Total
1 38 24 31 174 116 383
2 40 33 30 166 132 401
3 32 37 35 166 132 402
4 36 39 35 177 122 409
5 40 35 37 166 149 427
6 36 29 34 185 144 428
7 38 29 47 170 151 435
8 41 30 34 158 123 386
9 43 31 36 166 122 398
10 35 37 35 141 141 389
11 30 35 34 174 139 412
12 45 43 36 192 157 473
13 34 40 45 160 142 421
14 31 37 35 178 148 429
15 33 38 35 159 132 397
16 48 34 43 160 118 403
17 45 27 39 160 143 414
18 39 34 33 186 143 435
19 34 32 33 173 149 421
20 32 37 40 192 144 445
21 42 32 29 167 119 389
22 36 28 51 197 163 475
23 37 22 39 173 128 399
24 41 36 47 161 140 425
25 38 35 46 177 127 423
26 30 34 46 180 130 420
27 44 29 48 173 138 432
28 35 48 49 158 134 424
29 32 46 63 160 148 449
30 35 47 50 190 117 439
31 34 41 43 183 108 409
32 30 51 43 160 155 439
33 38 48 46 152 109 393
34 20 32 47 161 134 394
35 32 41 50 148 128 399
36 32 40 53 177 134 436
37 26 34 51 199 154 464
38 40 40 41 181 139 441
39 29 45 52 156 132 414
40 41 38 36 198 128 441
41 39 46 42 164 121 412
42 34 36 45 154 149 418
43 36 45 36 174 154 445
44 39 39 49 169 145 441
45 31 36 37 182 153 439
46 22 38 45 164 151 420
47 31 47 41 164 148 431
48 38 40 61 166 140 445
49 36 38 41 165 149 429
50 26 26 46 152 119 369
51 25 30 45 175 125 400
52 35 32 47 133 141 388
Total 1834 1901 2192 8816 7107 21850

Comments:

- sample size for each group looks good

3.1.2.5 week and package

# creates table of counts
week_package <- gm_joined_data %>%
  tabyl(week, package) %>%
  adorn_totals(where = c("row", "col")) 
week BOX CUP Total
1 373 10 383
2 392 9 401
3 393 9 402
4 401 8 409
5 413 14 427
6 414 14 428
7 426 9 435
8 382 4 386
9 392 6 398
10 377 12 389
11 403 9 412
12 461 12 473
13 414 7 421
14 420 9 429
15 391 6 397
16 389 14 403
17 407 7 414
18 428 7 435
19 409 12 421
20 438 7 445
21 382 7 389
22 467 8 475
23 392 7 399
24 411 14 425
25 411 12 423
26 409 11 420
27 419 13 432
28 415 9 424
29 437 12 449
30 427 12 439
31 400 9 409
32 427 12 439
33 387 6 393
34 380 14 394
35 385 14 399
36 427 9 436
37 457 7 464
38 426 15 441
39 406 8 414
40 431 10 441
41 403 9 412
42 407 11 418
43 437 8 445
44 427 14 441
45 425 14 439
46 412 8 420
47 415 16 431
48 427 18 445
49 411 18 429
50 359 10 369
51 385 15 400
52 379 9 388
Total 21306 544 21850

Comments:

- sample size for cup packaging across weeks is small, we won't be able to come up with any reliable findings about cup packaging per week

3.1.2.6 week and cereal

# creates table of counts
week_cereal <- gm_joined_data %>%
  tabyl(week, cereal) %>%
  adorn_totals(where = c("row", "col")) 
week CHEERIOS CINNAMON TST CR COCOA KRISPIES COCOA PUFFS FROOT LOOPS FROSTED FLAKES FROSTED MINI WHEATS GRAPE NUTS KIX LUCKY CHARMS RAISIN BRAN RICE KRISPIES SHREDDED WHEAT SMART START SPECIAL K Total
1 25 38 8 16 31 61 31 18 23 32 29 23 12 15 21 383
2 28 40 17 16 30 45 36 22 18 40 29 22 16 22 20 401
3 30 32 14 23 35 45 34 28 19 37 15 22 25 16 27 402
4 24 36 13 26 35 46 30 31 19 32 28 24 23 20 22 409
5 37 40 18 17 37 43 23 27 30 38 26 26 17 16 32 427
6 38 36 7 22 34 58 25 29 19 30 31 29 23 19 28 428
7 28 38 8 21 47 52 37 27 11 38 21 25 22 33 27 435
8 18 41 9 21 34 44 22 31 13 31 24 33 24 17 24 386
9 27 43 8 23 36 32 27 31 24 35 22 22 30 21 17 398
10 35 35 9 28 35 29 34 25 12 38 22 30 19 15 23 389
11 26 30 10 25 34 43 28 32 22 39 24 35 25 19 20 412
12 36 45 18 25 36 46 31 28 29 36 29 28 29 25 32 473
13 29 34 15 25 45 45 28 20 25 30 21 29 21 31 23 421
14 30 31 11 26 35 48 39 26 20 37 28 29 16 26 27 429
15 29 33 14 24 35 47 29 24 19 32 23 27 17 23 21 397
16 23 48 14 20 43 41 26 24 16 20 28 31 25 18 26 403
17 29 45 13 14 39 34 31 26 23 35 17 28 29 24 27 414
18 28 39 8 26 33 44 37 28 30 36 22 25 25 24 30 435
19 27 34 13 19 33 53 33 29 19 31 20 34 19 22 35 421
20 21 32 18 19 40 58 32 27 26 33 22 38 27 19 33 445
21 22 42 15 17 29 52 26 24 16 33 22 21 27 24 19 389
22 39 36 15 13 51 47 39 33 28 38 22 21 28 34 31 475
23 27 37 7 15 39 32 25 26 27 29 26 25 37 22 25 399
24 33 41 14 22 47 34 35 28 23 41 20 23 21 20 23 425
25 20 38 17 18 46 36 42 24 19 30 28 30 28 17 30 423
26 25 30 18 16 46 48 29 29 23 32 35 30 16 23 20 420
27 26 44 12 17 48 49 21 33 19 38 26 33 25 22 19 432
28 23 35 23 25 49 40 29 30 22 32 13 22 24 27 30 424
29 30 32 29 17 63 40 19 24 26 36 23 28 27 24 31 449
30 27 35 20 27 50 40 31 32 29 27 28 25 29 15 24 439
31 16 34 27 14 43 45 35 28 26 26 27 19 22 21 26 409
32 32 30 26 25 43 53 28 21 21 34 18 36 18 20 34 439
33 22 38 28 20 46 36 39 18 22 18 20 18 17 22 29 393
34 22 20 21 11 47 39 31 24 25 22 20 30 22 31 29 394
35 17 32 22 19 50 39 29 17 24 37 15 24 22 24 28 399
36 28 32 23 17 53 50 34 19 19 31 34 33 21 21 21 436
37 30 26 19 15 51 48 34 20 32 31 38 29 27 32 32 464
38 34 40 21 19 41 57 32 15 25 30 25 36 27 17 22 441
39 28 29 26 19 52 40 30 24 25 23 21 33 16 23 25 414
40 27 41 22 16 36 52 35 25 25 26 34 26 27 26 23 441
41 23 39 27 19 42 38 29 23 20 21 29 34 25 15 28 412
42 29 34 18 18 45 43 29 9 30 40 27 25 16 23 32 418
43 32 36 21 24 36 47 27 22 32 32 21 34 25 27 29 445
44 34 39 23 16 49 37 33 25 31 35 24 32 19 17 27 441
45 42 31 21 15 37 46 30 25 29 22 30 32 22 26 31 439
46 33 22 15 23 45 39 25 24 25 44 22 30 29 15 29 420
47 22 31 23 24 41 48 29 17 20 29 23 31 27 20 46 431
48 31 38 17 23 61 48 31 24 23 35 20 35 20 22 17 445
49 32 36 25 13 41 40 27 27 19 25 31 31 21 27 34 429
50 24 26 13 13 46 34 19 24 33 31 21 23 21 15 26 369
51 27 25 17 13 45 47 31 27 24 35 22 18 24 21 24 400
52 33 35 11 21 47 37 28 15 17 38 20 23 15 16 32 388
Total 1458 1834 881 1020 2192 2295 1574 1289 1196 1681 1266 1450 1189 1134 1391 21850

Comments:

- sample size for Cocoa Krispies sold in the weeks 1, 6, 7, 8, 9, 10, 18, and 23 may be too small

- week 42 might have a small sample size for Grape Nuts

- however, given the number of observations per week for each cereal the small counts for Cocoa Krispies and Grape Nuts cereals might not be an issue

3.1.3 promo

- Split between in store promotions vs no in store promotions in data set is disproportionate

3.1.3.1 promo and ad

# creates table of counts
promo_ad <- gm_joined_data %>%
  tabyl(ad, promo) %>%
  adorn_totals(where = c("row", "col")) 
ad 0 1 Total
A 683 773 1456
B 402 659 1061
NONE 16220 3113 19333
Total 17305 4545 21850

Comments:

- sample size for each group looks fine

- the peak advertisement category is none for when there is and isn't a promotion 

3.1.3.2 promo and brand

# creates table of counts
promo_brand <- gm_joined_data %>%
  tabyl(brand, promo) %>%
  adorn_totals(where = c("row", "col")) 
brand 0 1 Total
GENERAL MILLS 5909 1280 7189
KELLOGGS 9470 2713 12183
POST 1926 552 2478
Total 17305 4545 21850

Comments:

- sample size for each group looks fine

- Kelloggs sells the most when there is and isn't an in store promotion

3.1.3.3 promo and flavor

# creates table of counts
promo_flavor <- gm_joined_data %>%
  tabyl(flavor, promo) %>%
  adorn_totals(where = c("row", "col")) 
flavor 0 1 Total
CINNAMON TOAST 1537 297 1834
COCOA 1416 485 1901
FRUIT 1673 519 2192
REGULAR 7057 1759 8816
TOASTED 5622 1485 7107
Total 17305 4545 21850

Comments:

- sample size for each group looks fine, Cinnamon Toast flavored sales when there is an in store promotion might have a small sample size

- regular flavor sells the most when there is and isn't an in store promotion

3.1.3.4 promo and package

# creates table of counts
promo_package <- gm_joined_data %>%
  tabyl(package, promo) %>%
  adorn_totals(where = c("row", "col")) 
package 0 1 Total
BOX 16910 4396 21306
CUP 395 149 544
Total 17305 4545 21850

Comments:

- sample size for each group looks fine, when there is an in store promotion sales with cup packaging might have a small sample size

- box packaging sells the most when there is and isn't an in store promotion

3.1.3.5 promo and cereal

# creates table of counts
promo_cereal <- gm_joined_data %>%
  tabyl(cereal, promo) %>%
  adorn_totals(where = c("row", "col")) 
cereal 0 1 Total
CHEERIOS 1240 218 1458
CINNAMON TST CR 1537 297 1834
COCOA KRISPIES 647 234 881
COCOA PUFFS 769 251 1020
FROOT LOOPS 1673 519 2192
FROSTED FLAKES 1819 476 2295
FROSTED MINI WHEATS 1248 326 1574
GRAPE NUTS 1002 287 1289
KIX 997 199 1196
LUCKY CHARMS 1366 315 1681
RAISIN BRAN 1060 206 1266
RICE KRISPIES 1162 288 1450
SHREDDED WHEAT 924 265 1189
SMART START 812 322 1134
SPECIAL K 1049 342 1391
Total 17305 4545 21850

Comments:

- sample size for each group looks fine, Kix or Raisin Bran sales when there is an in store promotion might have a small sample size

- Frosted Flakes sells the most when there is and isn't an in store promotion

3.1.4 ad

3.1.4.1 ad and brand

# creates table of counts
ad_brand <- gm_joined_data %>%
  tabyl(brand, ad) %>%
  adorn_totals(where = c("row", "col")) 
brand A B NONE Total
GENERAL MILLS 442 272 6475 7189
KELLOGGS 903 664 10616 12183
POST 111 125 2242 2478
Total 1456 1061 19333 21850

Comments:

- Post brand sales when there is an advertisement (A or B) might have a small sample size

- Kelloggs sells the most for every advertisement category 

3.1.4.2 ad and flavor

# creates table of counts
ad_flavor <- gm_joined_data %>%
  tabyl(flavor, ad) %>%
  adorn_totals(where = c("row", "col")) 
flavor A B NONE Total
CINNAMON TOAST 114 51 1669 1834
COCOA 128 91 1682 1901
FRUIT 153 108 1931 2192
REGULAR 516 439 7861 8816
TOASTED 545 372 6190 7107
Total 1456 1061 19333 21850

Comments:

- Cinnamon Toast and Cocoa flavored sales when there is an advertisement category of B have a small sample size, Fruit might also have a small sample size
  - When the advertisement category is A might also have a small sample size for Cinnamon Toast, Cocoa, and Fruit flavored sales

- Regular flavor sells the most when advertisement category is B or None

- Toasted sells the most when advertisement category is A

3.1.4.3 ad and package

# creates table of counts
ad_package <- gm_joined_data %>%
  tabyl(package, ad) %>%
  adorn_totals(where = c("row", "col")) 
package A B NONE Total
BOX 1444 1025 18837 21306
CUP 12 36 496 544
Total 1456 1061 19333 21850

Comments:

- Cup packaging has a small sample size  when there is an advertisement category of A or B 

- Box packaging sells the most for every advertisement category 

3.1.4.4 ad and cereal

# creates table of counts
ad_cereal <- gm_joined_data %>%
  tabyl(cereal, ad) %>%
  adorn_totals(where = c("row", "col")) 
cereal A B NONE Total
CHEERIOS 79 44 1335 1458
CINNAMON TST CR 114 51 1669 1834
COCOA KRISPIES 52 37 792 881
COCOA PUFFS 76 54 890 1020
FROOT LOOPS 153 108 1931 2192
FROSTED FLAKES 141 124 2030 2295
FROSTED MINI WHEATS 127 88 1359 1574
GRAPE NUTS 46 63 1180 1289
KIX 70 54 1072 1196
LUCKY CHARMS 103 69 1509 1681
RAISIN BRAN 67 47 1152 1266
RICE KRISPIES 114 88 1248 1450
SHREDDED WHEAT 65 62 1062 1189
SMART START 124 90 920 1134
SPECIAL K 125 82 1184 1391
Total 1456 1061 19333 21850

Comments:

- Advertisement category B doesn't have a large enough sample size for all cereal categories

- Froot Loops cereal sells the most when the advertisement category is A

- Frosted Flakes appear to sell the most when the advertisement category is B

- Frosted Flakes cereal sells the most when the advertisement category is none, followed closely by Froot Loops

3.1.5 brand

3.1.5.1 brand and flavor

# creates table of counts
brand_flavor <- gm_joined_data %>%
  tabyl(flavor, brand) %>%
  adorn_totals(where = c("row", "col")) 
flavor GENERAL MILLS KELLOGGS POST Total
CINNAMON TOAST 1834 0 0 1834
COCOA 1020 881 0 1901
FRUIT 0 2192 0 2192
REGULAR 1203 5135 2478 8816
TOASTED 3132 3975 0 7107
Total 7189 12183 2478 21850

Comments:

- When the brand is General Mills, there is a small sample size for Fruit flavored sales
  - Toasted flavor sold the most

- When the brand is Kelloggs, there is a small sample size for Cinnamon Toast flavored sales and potentially a small sample for Cocoa flavored sales
  - Regular flavor sold the most

- When the brand is Post the only flavor sold was Regular

3.1.5.2 brand and package

# creates table of counts
brand_package <- gm_joined_data %>%
  tabyl(package, brand) %>%
  adorn_totals(where = c("row", "col")) 
package GENERAL MILLS KELLOGGS POST Total
BOX 7025 11803 2478 21306
CUP 164 380 0 544
Total 7189 12183 2478 21850

Comments:

- When the brand is General Mills, there is a small sample size for Cup packaging
- When the brand is Kelloggs, there is a potentially a small sample for Cup packaging
- When the brand is Post the only packaging sold was Box

3.1.5.3 brand and cereal

# creates table of counts
brand_cereal <- gm_joined_data %>%
  tabyl(cereal, brand) %>%
  adorn_totals(where = c("row", "col")) 
cereal GENERAL MILLS KELLOGGS POST Total
CHEERIOS 1458 0 0 1458
CINNAMON TST CR 1834 0 0 1834
COCOA KRISPIES 0 881 0 881
COCOA PUFFS 1020 0 0 1020
FROOT LOOPS 0 2192 0 2192
FROSTED FLAKES 0 2295 0 2295
FROSTED MINI WHEATS 0 1574 0 1574
GRAPE NUTS 0 0 1289 1289
KIX 1196 0 0 1196
LUCKY CHARMS 1681 0 0 1681
RAISIN BRAN 0 1266 0 1266
RICE KRISPIES 0 1450 0 1450
SHREDDED WHEAT 0 0 1189 1189
SMART START 0 1134 0 1134
SPECIAL K 0 1391 0 1391
Total 7189 12183 2478 21850

Comments:

- When the brand is General Mills, the only cereals sold are: Cheerios, Cinnamon TST CR, Cocoa Puffs, Kix, and Lucky Charms
  - Cinnamon TST CR sold the most
  - Cocoa Puffs sold the least

- When the brand is Kelloggs, the only cereals sold are: Cocoa Krispies, Froot Loops, Frosted Flakes, Frosted Mini Wheats, Rasin Bran, Rice Krispies, Smart Start, and Special K
  - Frosted Flakes sold the most
  - Cocoa Krispies sold the least

- When the brand is Post, the only cereals sold are: Grape Nuts and Shredded Wheats
  - Grape Nuts sold the most
  - Shredded Wheat sold the least

3.1.6 flavor

3.1.6.1 flavor and package

# creates table of counts
flavor_package <- gm_joined_data %>%
  tabyl(package, flavor) %>%
  adorn_totals(where = c("row", "col")) 
package CINNAMON TOAST COCOA FRUIT REGULAR TOASTED Total
BOX 1774 1897 2099 8644 6892 21306
CUP 60 4 93 172 215 544
Total 1834 1901 2192 8816 7107 21850

Comments:

- Box sold the most for each flavor
- When the packaging is Cup the flavors of Cinnamon Toast, Cocoa, Fruit, and Regular have small sample sizes

3.1.6.2 flavor and cereal

# creates table of counts
flavor_cereal <- gm_joined_data %>%
  tabyl(cereal, flavor) %>%
  adorn_totals(where = c("row", "col")) 
cereal CINNAMON TOAST COCOA FRUIT REGULAR TOASTED Total
CHEERIOS 0 0 0 4 1454 1458
CINNAMON TST CR 1834 0 0 0 0 1834
COCOA KRISPIES 0 881 0 0 0 881
COCOA PUFFS 0 1020 0 0 0 1020
FROOT LOOPS 0 0 2192 0 0 2192
FROSTED FLAKES 0 0 0 2295 0 2295
FROSTED MINI WHEATS 0 0 0 1574 0 1574
GRAPE NUTS 0 0 0 1289 0 1289
KIX 0 0 0 1196 0 1196
LUCKY CHARMS 0 0 0 3 1678 1681
RAISIN BRAN 0 0 0 1266 0 1266
RICE KRISPIES 0 0 0 0 1450 1450
SHREDDED WHEAT 0 0 0 1189 0 1189
SMART START 0 0 0 0 1134 1134
SPECIAL K 0 0 0 0 1391 1391
Total 1834 1901 2192 8816 7107 21850

Comments:

- When the flavor is Cinnamon Toast, Cinnamon TST CR is the only cereal sold

- When the flavor is Cocoa, Cinnamon Cocoa Krispies and Coco Puffs are the only cereal sold
  - Cocoa Puffs sold the most
  - Cocoa Krispies sold the least

- When the flavor is Fruit, Froot Loops is the only cereal sold

- When the flavor is Regular, the only cereals sold are: Cheerios, Frosted Flakes, Frosted Mini Wheats, Grape Nuts, Kix, Lucky Charms, Rasin Bran, and Shredded Wheat
  - Frosted Flakes sold the most
  - Lucky Charms sold the least

- When the flavor is Toasted, the only cereals sold are: Cheerios, Lucky Charms, Rice Krispies, Smart Start, Special K
  - Frosted Flakes sold the most
  - Smart Start sold the least
  • What’s going on with the few observations for cheerios and lucky charms listed as Regular flavor rather than Toasted?

3.1.7 package

3.1.7.1 package and cereal

# creates table of counts
package_cerial <- gm_joined_data %>%
  tabyl(cereal, package) %>%
  adorn_totals(where = c("row", "col")) 
cereal BOX CUP Total
CHEERIOS 1411 47 1458
CINNAMON TST CR 1774 60 1834
COCOA KRISPIES 877 4 881
COCOA PUFFS 1020 0 1020
FROOT LOOPS 2099 93 2192
FROSTED FLAKES 2158 137 2295
FROSTED MINI WHEATS 1539 35 1574
GRAPE NUTS 1289 0 1289
KIX 1196 0 1196
LUCKY CHARMS 1624 57 1681
RAISIN BRAN 1266 0 1266
RICE KRISPIES 1425 25 1450
SHREDDED WHEAT 1189 0 1189
SMART START 1134 0 1134
SPECIAL K 1305 86 1391
Total 21306 544 21850

Comments:

- Box sold the most for each flavor
- When the packaging is Cup, no category of cereal has enough observations 

3.2 Quantitative Variables

3.2.1 Correlation Table

# create correlation matrix
correlation_matrix <- gm_joined_data %>%
  mutate(week = factor(week)) %>%
  select(-iri_key) %>%
  select_if(is.numeric) %>%
  cor() %>%
  round(2)
units price volume
units 1.00 -0.19 0.02
price -0.19 1.00 0.54
volume 0.02 0.54 1.00
  • there is a negative correlation coefficient ( -0.19) between price and units
  • there is a positive correlation coefficient ( 0.54) between price and volume
  • there is a small, positive correlation between volume and units ( 0.02)

3.3 Questions/Comments

  • What’s going on with the four observations for cheerios and three observations for lucky charms that are listed as Regular flavor rather than Toasted?

    • is this a data entry error?
  • why is there a negative correlation coefficient between price and units?

4 Base EDA Step 4: Multi-Variate Graphical EDA

4.1 Categorical Variables

- visual of store (iri_key) isn't clear as a bar graph, so I'm not sure looking into it again here would really show us much
# build mosiac plot
gm_joined_data %>%
group_by(iri_key, promo) %>% 
summarise(count = n()) %>% 
ggplot(aes(iri_key, promo)) + 
geom_tile(aes(fill = count)) 
## group_by: 2 grouping variables (iri_key, promo)
## summarise: now 2,656 rows and 3 columns, one group variable remaining (iri_key)

# build bar graph with counts for multiple categorical variables
gm_joined_data %>%
  ggplot(mapping = aes(x = iri_key, fill = promo)) +
  geom_bar(position = "dodge") +
  coord_flip()

Comments:

- these plots confirm that visualizing iri_key isn't very telling

4.1.1 week

4.1.1.1 week and promo

# build bar graph with counts for multiple categorical variables
gm_joined_data %>%
  ggplot(mapping = aes(x = factor(week), fill = promo)) +
  geom_bar(position = "dodge") +
  coord_flip()

Comments:

- confirms there are more observations for non-promotion weeks than when there was an in store promotion in this data set

4.1.1.2 week and ad

# build bar graph with counts for multiple categorical variables
gm_joined_data %>%
  ggplot(mapping = aes(x = factor(week), fill = ad)) +
  geom_bar(position = "dodge") +
  coord_flip()

Comments:

- when looking at advertisements over weeks we may want to split them out from when there is not an advertisement in order to evaluate performance of advertisement A against advertisement B
# build bar graph with counts for when there is an advertisement only - first half of the weeks
gm_joined_data %>%
  filter(ad %in% c("A", "B"), week %in% c(1:26)) %>%
  ggplot(mapping = aes(x = factor(week), fill = ad)) +
  geom_bar(position = "dodge") +
  coord_flip()

# second half of the weeks
gm_joined_data %>%
  filter(ad %in% c("A", "B"), week %in% c(27:52)) %>%
  ggplot(mapping = aes(x = factor(week), fill = ad)) +
  geom_bar(position = "dodge") +
  coord_flip()
## filter: removed 20,790 rows (95%), 1,060 rows remaining

Comments:

- week 39 (typically around mid to late September) has a low count of medium sized advertisements

- there appears to be more medium advertisements (A) than small advertisements (B) per week, but that isn't consistent across all weeks

4.1.1.3 week and brand

# build bar graph with counts for multiple categorical variables
gm_joined_data %>%
  ggplot(mapping = aes(x = factor(week), fill = brand)) +
  geom_bar(position = "dodge") +
  coord_flip()

Comments:

- confirms across all weeks there are more Kelloggs brand than GM brand and more GM than Post brand observations in the data

4.1.1.4 week and flavor

# build bar graph with counts for first 26 weeks
gm_joined_data %>%
  filter( week %in% c(1:26)) %>%
  ggplot(mapping = aes(x = factor(week), fill = flavor)) +
  geom_bar(position = "dodge") +
  coord_flip()

# build bar graph with counts for last 26 weeks
gm_joined_data %>%
  filter( week %in% c(27:52)) %>%
  ggplot(mapping = aes(x = factor(week), fill = flavor)) +
  geom_bar(position = "dodge") +
  coord_flip()

Comments:

- confirms Regular followed by Toasted flavors typically have the most observations each week

- in week 52 toasted has more observations more than regular

- in week 10 toasted and regular seem to have the same number of observations

4.1.1.5 week and package

# build bar graph with counts
gm_joined_data %>%
  ggplot(mapping = aes(x = factor(week), fill = package)) +
  geom_bar(position = "dodge") +
  coord_flip()

Comments:

- confirms box packaging sold the most each week

4.1.1.6 week and cereal

# build mosaic plot first 26 weeks
gm_joined_data %>%
  filter(week %in% c(1:26)) %>%
  group_by(week, cereal) %>% 
  summarise(count = n()) %>% 
  ggplot(aes(factor(week), cereal)) + 
  geom_tile(aes(fill = count)) 

# build mosaic plot las 26 weeks
gm_joined_data %>%
  filter(week %in% c(27:52)) %>%
  group_by(week, cereal) %>% 
  summarise(count = n()) %>% 
  ggplot(aes(factor(week), cereal)) + 
  geom_tile(aes(fill = count)) 

Comments:

- confirms box frosted flakes and froot loops typically had the most observations 

- in the first 26 weeks of the year, Cocoa Krispies typically had the least observations

- in the last 26 weeks it looks like Cocoa Puffs typically had the least sales

4.1.2 promo

4.1.2.1 promo and ad

# build bar graph with counts for multiple categorical variables
gm_joined_data %>%
  ggplot(mapping = aes(x = ad, fill = promo)) +
  geom_bar(position = "dodge") +
  coord_flip()

Comments:

- when you only look at the promotions for ad A compared with ad B there appears to be a similar pattern of promotions. It might be interesting to look at this further over weeks or stores
# build bar graph with counts when ad is A or B
gm_joined_data %>%
  filter(ad != "NONE") %>%
  ggplot(mapping = aes(x = ad, fill = promo)) +
  geom_bar(position = "dodge") +
  coord_flip()

# build mosaic plot with promotion and either Ad A or Ad B
gm_joined_data %>%
  filter(ad != "NONE") %>%
  group_by(ad, promo) %>% 
  summarise(count = n()) %>% 
  ggplot(aes(ad, promo)) + 
  geom_tile(aes(fill = count)) 

Comments:

- interesting to see that there are more observations of ad B and A when there is an also in store promotion being ran than when there isn't

4.1.2.2 promo and brand

# observation counts for promotions over brands
gm_joined_data %>%
  ggplot(mapping = aes(x = brand, fill = promo)) +
  geom_bar(position = "dodge") +
  coord_flip()

Comments:

- confirms there are more observations for each brand when there isn't an in store promotion

4.1.2.3 promo and flavor

# build bar graph with counts for promotions over flavors
gm_joined_data %>%
  ggplot(mapping = aes(x = flavor, fill = promo)) +
  geom_bar(position = "dodge") +
  coord_flip()

Comments:

- confirms there are more observations for each flavor when there isn't an in store promotion

4.1.2.4 promo and package

# build bar graph with counts for promotions over package types
gm_joined_data %>%
  ggplot(mapping = aes(x = package, fill = promo)) +
  geom_bar(position = "dodge") +
  coord_flip()

Comments:

- confirms there are more observations for each package type when there isn't an in store promotion

4.1.2.5 promo and cereal

# build bar graph with counts for promo over cereal types
gm_joined_data %>%
  ggplot(mapping = aes(x = cereal, fill = promo)) +
  geom_bar(position = "dodge") +
  coord_flip()

Comments:

- confirms there are more observations for each cereal when there isn't an in store promotion

4.1.3 ad

4.1.3.1 ad and brand

# build bar graph with counts for ad caegories over brands
gm_joined_data %>%
  ggplot(mapping = aes(x = brand, fill = ad)) +
  geom_bar(position = "dodge") +
  coord_flip()

Comments:

- confirms there are more observations for each brand when there isn't an advertisement
# build bar graph with counts for ad categories A & B over brands
gm_joined_data %>%
  filter(ad != "NONE") %>%
  ggplot(mapping = aes(x = brand, fill = ad)) +
  geom_bar(position = "dodge") +
  coord_flip()

Comments:

- Post has more observations for advertisement B than A when there is an advertisement 

- Kelloggs and GM have more observations for advertisement A when there is an ad

4.1.3.2 ad and flavor

# build bar graph with counts for ad categories over flavors
gm_joined_data %>%
  ggplot(mapping = aes(x = flavor, fill = ad)) +
  geom_bar(position = "dodge") +
  coord_flip()

Comments:

- confirms there are more observations for each flavor when there isn't an advertisement
# build bar graph with counts for  ad categories A & B over flavors
gm_joined_data %>%
  filter(ad != "NONE") %>%
  ggplot(mapping = aes(x = flavor, fill = ad)) +
  geom_bar(position = "dodge") +
  coord_flip()

Comments:

- when there is an advertisement, there are more observations for ad A across all flavors

4.1.3.3 ad and package

# build bar graph with counts for ad categories over package types
gm_joined_data %>%
  ggplot(mapping = aes(x = package, fill = ad)) +
  geom_bar(position = "dodge") +
  coord_flip()

Comments:

- confirms there are more observations for each flavor when there isn't an advertisement

4.1.3.4 ad and cereal

# build bar graph with counts for ad categories over cereal types
gm_joined_data %>%
  ggplot(mapping = aes(x = cereal, fill = ad)) +
  geom_bar(position = "dodge") +
  coord_flip()

Comments:

- confirms there are more observations for each cereal when there isn't an advertisement
# build bar graph with counts for  ad categories A & B over cereal types
gm_joined_data %>%
  filter(ad != "NONE") %>%
  ggplot(mapping = aes(x = cereal, fill = ad)) +
  geom_bar(position = "dodge") +
  coord_flip()

Comments:

- When there is an advertisement, all cereals have more observations for advertisement A except for grape nuts 
- Grape nuts have more observations for ad B

4.1.4 brand

4.1.4.1 brand and flavor

# build bar graph with counts for ad categories over cereal types
gm_joined_data %>%
  ggplot(mapping = aes(x = flavor, fill = brand)) +
  geom_bar(position = "dodge") +
  coord_flip()

- confirms all three brands sell regular flavored products and that kelloggs sells the most of that flavor

- confirms Post only sells regular flavored products, GM only sells toasted, regular, cocoa and cinnamon toast flavored products and Kelloggs only sells toasted, regular, cocoa, and cinnamon toast flavored products

- if we're trying to conduct a comparison of flavors by brands, we're going to run into some issues

4.1.4.2 brand and package

# build bar graph with counts for ad categories over cereal types
gm_joined_data %>%
  ggplot(mapping = aes(x = package, fill = brand)) +
  geom_bar(position = "dodge") +
  coord_flip()

Comments:

- confirms all three brands have more obsercations for boxed products 

4.1.4.3 brand and cereal

# build bar graph with counts for ad categories over cereal types
gm_joined_data %>%
  ggplot(mapping = aes(x = cereal, fill = brand)) +
  geom_bar(position = "dodge") +
  coord_flip()

Comments:

- confirms kelloggs' frosted flakes and froot loops cereals have the most observations

4.1.5 flavor

4.1.5.1 flavor and package

# build bar graph with counts for ad categories over cereal types
gm_joined_data %>%
  ggplot(mapping = aes(x = flavor, fill = package)) +
  geom_bar(position = "dodge") +
  coord_flip()

Comments:

- confirms for each flavor, boxed packaging has the most observations

4.1.5.2 flavor and cereal

# build bar graph with counts for ad categories over cereal types
gm_joined_data %>%
  ggplot(mapping = aes(x = cereal, fill = flavor)) +
  geom_bar(position = "dodge") +
  coord_flip()

Comments:

- confirms something is wrong with Lucky Charms and Cheerio observations recorded as regular flavor (should be toasted)

4.1.6 package

4.1.6.1 package and cereal

# build bar graph with counts for ad categories over cereal types
gm_joined_data %>%
  ggplot(mapping = aes(x = cereal, fill = package)) +
  geom_bar(position = "dodge") +
  coord_flip()

Comments:

- confirms most observations for each cereal were packaged in a box 

4.2 Quantitative Variables

# set up colors
fill_color <- "#4271AE"
line_color <- "#1F3552"

4.2.1 units

4.2.1.1 units and price

- negative correlation coefficient between units and price
# create box plot
units_price_box <-  gm_joined_data %>% 
  group_by(units) %>%
  ggplot(mapping = aes(x = units, group = units, y = price)) +
  geom_boxplot(fill = fill_color, colour = line_color)

# create scatter plot
units_price_scatter <- gm_joined_data %>% 
  ggplot(mapping = aes(x = units, y = price)) +
  geom_point() +
  geom_smooth(method = "lm")

# output
units_price_box + units_price_scatter

Comments:

- confirms there is some kind of negative relationship between units and price 

- need to look at this over cereal types

4.2.1.2 units and volume

- slight positive correlation coefficient between units and volume
# create box plot
units_vol_box <-  gm_joined_data %>% 
  group_by(units) %>%
  ggplot(mapping = aes(x = units, group = units, y = volume)) +
  geom_boxplot(fill = fill_color, colour = line_color)


# create scatter plot
units_vol_scatter <- gm_joined_data %>% 
  ggplot(mapping = aes(x = units, y = volume)) +
  geom_point() +
  geom_smooth(method = "lm")

# output
units_vol_box + units_vol_scatter

Comments:

- confirms there is some kind of positive relationship between units and volume, but this looks odd 
- average volume is not consistently increasing with units in a linear manner

- need to look at this over cereal types

4.2.2 price

4.2.2.1 price and volume

- slight positive correlation coefficient between units and volume
# create box plot
price_vol_box <-  gm_joined_data %>% 
  group_by(volume) %>%
  ggplot(mapping = aes(x = volume, group = volume, y = price)) +
  geom_boxplot(fill = fill_color, colour = line_color)


# create scatter plot
price_vol_scatter <- gm_joined_data %>% 
  ggplot(mapping = aes(x = volume, y = price)) +
  geom_point() +
  geom_smooth(method = "lm")

# output
price_vol_box + price_vol_scatter

Comments:

- confirms there is some kind of positive relationship between price and volume

4.3 Categorical and Quantitative Variables

4.3.1 Price interactions

# boxplot of price per brand
price_promo <- gm_total_sales %>% 
  ggplot(mapping = aes(x = promo, y = price)) +
  geom_boxplot(fill = fill_color, colour = line_color) 

price_ad <- gm_total_sales %>% 
  ggplot(mapping = aes(x = ad, y = price)) +
  geom_boxplot(fill = fill_color, colour = line_color) 

price_brand <- gm_total_sales %>% 
  ggplot(mapping = aes(x = brand, y = price)) +
  geom_boxplot(fill = fill_color, colour = line_color) 

price_flavor <- gm_total_sales %>% 
  ggplot(mapping = aes(x = flavor, y = price)) +
  geom_boxplot(fill = fill_color, colour = line_color) 

price_package <- gm_total_sales %>% 
  ggplot(mapping = aes(x = package, y = price)) +
  geom_boxplot(fill = fill_color, colour = line_color) 

# this graph doesn't look very good, its hard ot read
price_cereal <- gm_total_sales %>% 
  ggplot(mapping = aes(x = cereal, y = price)) +
  geom_boxplot(fill = fill_color, colour = line_color) 

(price_promo + price_package) / (price_flavor) /( price_brand  + price_ad ) / price_cereal

4.3.2 Unit Interactions

# boxplot of unit per brand
unit_promo <- gm_total_sales %>%
  ggplot(mapping = aes(x = promo)) +
  geom_bar(fill = fill_color, colour = line_color) 

unit_ad <- gm_total_sales %>% 
  ggplot(mapping = aes(x = ad)) +
  geom_bar(fill = fill_color, colour = line_color) 

unit_brand <- gm_total_sales %>% 
  ggplot(mapping = aes(x = brand)) +
  geom_bar(fill = fill_color, colour = line_color) 

unit_flavor <- gm_total_sales %>% 
  ggplot(mapping = aes(x = flavor)) +
  geom_bar(fill = fill_color, colour = line_color) 

unit_package <- gm_total_sales %>% 
  ggplot(mapping = aes(x = package)) +
  geom_bar(fill = fill_color, colour = line_color) 

# this graph doesn't look very good, its hard to read
unit_cereal <- gm_total_sales %>% 
  ggplot(mapping = aes(x = cereal)) +
  geom_bar(fill = fill_color, colour = line_color) 

(unit_promo + unit_package) / (unit_flavor) /( unit_brand  + unit_ad ) / unit_cereal

4.3.3 Volume Interations

# boxplot of volume per brand
volume_promo <- gm_total_sales %>% 
  ggplot(mapping = aes(x = promo, y = volume)) +
  geom_boxplot(fill = fill_color, colour = line_color) 

volume_ad <- gm_total_sales %>% 
  ggplot(mapping = aes(x = ad, y = volume)) +
  geom_boxplot(fill = fill_color, colour = line_color) 

volume_brand <- gm_total_sales %>% 
  ggplot(mapping = aes(x = brand, y = volume)) +
  geom_boxplot(fill = fill_color, colour = line_color) 

volume_flavor <- gm_total_sales %>% 
  ggplot(mapping = aes(x = flavor, y = volume)) +
  geom_boxplot(fill = fill_color, colour = line_color) 

volume_package <- gm_total_sales %>% 
  ggplot(mapping = aes(x = package, y = volume)) +
  geom_boxplot(fill = fill_color, colour = line_color) 

# this graph doesn't look very good, its hard ot read
volume_cereal <- gm_total_sales %>% 
  ggplot(mapping = aes(x = cereal, y = volume)) +
  geom_boxplot(fill = fill_color, colour = line_color) 

(volume_promo + volume_package) / (volume_flavor) /( volume_brand  + volume_ad ) / volume_cereal

4.3.4 Brand interaction futher analysis

# Brand and Price
# brand total sales
brand_sales <- gm_total_sales %>%
  group_by(brand) %>%
  summarise(median_price = median(price),
            mean_price = mean(price),
            totalunits = n(),
            totalsales = sum(price),
            totalvolume = sum(volume))

# build bar graph with total sales per brand
brand_price <- brand_sales %>%
  ggplot(mapping = aes(x = brand, y = totalsales, fill = brand)) +
  geom_col(stat= 'identity', position = "dodge") +
  scale_y_continuous(labels=scales::dollar_format()) +
  coord_flip()

## brand and units
# build bar graph with total units per brand
brand_units <- brand_sales %>%
  ggplot(mapping = aes(x = brand, y = totalunits, fill = brand)) +
  geom_col(stat= 'identity', position = "dodge") +
  coord_flip()

## brand and volume
# build bar graph with total units per brand
brand_volume <- brand_sales %>%
  ggplot(mapping = aes(x = brand, y = totalvolume, fill = brand)) +
  geom_col(stat= 'identity', position = "dodge") +
  coord_flip()

brand_price / brand_units / brand_volume

4.3.5 cereal interactions further analysis

# Cereal and Price
# brand total sales
cereal_sales <- gm_total_sales %>%
  group_by(brand, cereal) %>%
  summarise(median_price = median(price),
            mean_price = mean(price),
            totalunits = n(),
            totalsales = dollar(sum(price)),
            totalvolume = sum(volume))

# build bar graph with total sales per cereal
cereal_sales %>%
  ggplot(mapping = aes(x = cereal, y = totalsales, fill = brand)) +
  geom_col(stat= 'identity', position = "dodge") +
  coord_flip()

## cereal and units
# build bar graph with total units per brand
cereal_sales %>%
  ggplot(mapping = aes(x = cereal, y = totalunits, fill = brand)) +
  geom_col(stat= 'identity', position = "dodge") +
  coord_flip()

## cereal and volume
# build bar graph with total units per brand
cereal_sales %>%
  ggplot(mapping = aes(x = cereal, y = totalvolume, fill = brand)) +
  geom_col(stat= 'identity', position = "dodge") +
  coord_flip()

4.4 Questions/Comments

  • we could look at promotion performance across stores since the promotion is reliant upon the store

  • Which ad had better performance A or B

  • need to look at units per cereal and find total sales price/ total sales per cereal

5 Detailed EDA

  • Questions to be answered outside of the data

    • Are the four observations for cheerios and three observations for lucky charms that are listed as Regular flavor rather than Toasted data entry errors?

    • Is this data set representative of the marketplace for these three brands?

      • is this a typical distribution of promotions to no promotions
      • is this the typical distribution between advertisement categories per year for GM
      • how about packaging? is there usually a significantly higher proportion of boxed products sold?
    • Can we get actual dates associated with these observations rather than just week numbers recorded in this data set?

      • conducting a holiday or quarterly analysis could be beneficial for identifying seasonal trends
      • it would also help with identifying our competitor’s promo and advertisement schedules
  • Questions that can be answered from the data?

    • How well do promotions perform? (promotion performance)

      • Performance between brands?
      • How about only looking at General Mills promotions?
      • How do promotion’s perform by flavor between brands?
      • How do GM’s promotions perform between flavors?
    • How well do advertisements perform? (advertisement performance)

      • Performance across brands?
      • How about GM’s advertisements based on cereal?
      • How about GM’s advertisements based on flavors?
    • What about overall sales performance for each brand? (overall cereal performance)

      • looking at performance in terms of units sold and sales revenue
    • What is the advertisement and in store promotion strategy for each brand? (competitive strategy performance)

    • Why is there a negative correlation coefficient between price and units?

5.1 Promotion Performance

5.1.1 Promotion Performance Among Brands

# GM promotional sales Figures in relation to others
gm_total_sales %>%
  filter(promo == 1) %>%
  ggplot(mapping = aes(x = brand, fill = brand)) +
  geom_bar() +
  labs(y = "Units Sold")

  • General Mills sits between Kelloggs and Post in promotional units sold

    • Consistent with non-promitional sales figures

5.1.2 Promotion Performance Within General Mills

# Overall sales profile of units and sold and revenue between promotions and non-promotion
gm_cereal_Punits <- gm_total_sales %>%
  filter(brand == "GENERAL MILLS", promo == 1) %>%
  ggplot(mapping = aes(x = cereal, fill = cereal)) +
  geom_bar(show.legend = FALSE) +
  coord_flip() +
  ggtitle("Promotion") +
  labs(y = "Units Sold")

gm_cereal_Runits <- gm_total_sales %>%
  filter(brand == "GENERAL MILLS", promo == 0) %>%
  ggplot(mapping = aes(x = cereal, fill = cereal)) +
  geom_bar(show.legend = FALSE) +
  coord_flip() +
  ggtitle("No Promotion") +
  labs(y = "Units Sold")

gm_cereal_Prev <- gm_total_sales %>%
  filter(brand == "GENERAL MILLS", promo == 1) %>%
  ggplot(mapping = aes(x = cereal, y = price, fill = cereal)) +
  geom_bar(stat = "identity", show.legend = FALSE) +
  coord_flip() +
  labs(y = "Revenue")

gm_cereal_Rrev <- gm_total_sales %>%
  filter(brand == "GENERAL MILLS", promo == 0) %>%
  ggplot(mapping = aes(x = cereal, y = price, fill = cereal)) +
  geom_bar(stat = "identity", show.legend = FALSE) +
  coord_flip() +
  labs(y = "Revenue")

(gm_cereal_Punits +  gm_cereal_Runits) / (gm_cereal_Prev + gm_cereal_Rrev)

  • The overall sales profile for General Mills changes during a promotion. when promotions are being ran Lucky Charms moves from the third highest seller to the top seller

    • Cheerios moves from first to third
    • Cinnamon Toast Crunch stays in second

5.1.3 Promotional Flavor Analysis Between Brands

# Examine regular, toasted, and cocoa flavor promotion sales across brands
reg_brands_units <- gm_total_sales %>%
  filter(flavor == "REGULAR", promo == 1) %>%
  ggplot(mapping = aes(x = brand, fill = brand)) +
  geom_bar(show.legend = FALSE) +
  coord_flip() +
  ggtitle("Regular on Promotion") +
  labs(y = "Units Sold")

toasted_brands_units <- gm_total_sales %>%
  filter(flavor == "TOASTED", promo == 1) %>%
  ggplot(mapping = aes(x = brand, fill = brand)) +
  geom_bar(show.legend = FALSE) +
  coord_flip() +
  ggtitle("Toasted on Promotion") +
  labs(y = "Units Sold")

cocoa_brands_units <- gm_total_sales %>%
  filter(flavor == "COCOA", promo == 1) %>%
  ggplot(mapping = aes(x = brand, fill = brand)) +
  geom_bar(show.legend = FALSE) +
  coord_flip() +
  ggtitle("Cocoa on Promotion") +
  labs(y = "Units Sold")

reg_brands_units / toasted_brands_units / cocoa_brands_units

  • General Mills trails units sold in regular flavored cereal to both Post and Kelloggs

    • General Mills also trails in the Toasted segment
    • General Mills sells more units in the Cocoa segment compared to Kelloggs

5.1.4 Promotional Flavor Analysis Within Brand

# Overall sales profile of units and sold and revenue between promotions and non-promotion based on flavor
gm_flavor_Punits <- gm_total_sales %>%
  filter(brand == "GENERAL MILLS", promo == 1) %>%
  ggplot(mapping = aes(x = flavor, fill = flavor)) +
  geom_bar(show.legend = FALSE) +
  coord_flip() +
  ggtitle("Promotion") +
  labs(y = "Units Sold")

gm_flavor_Runits <- gm_total_sales %>%
  filter(brand == "GENERAL MILLS", promo == 0) %>%
  ggplot(mapping = aes(x = flavor, fill = flavor)) +
  geom_bar(show.legend = FALSE) +
  coord_flip() +
  ggtitle("No Promotion") +
  labs(y = "Units Sold")

gm_flavor_Prev <- gm_total_sales %>%
  filter(brand == "GENERAL MILLS", promo == 1) %>%
  ggplot(mapping = aes(x = flavor, y = price, fill = flavor)) +
  geom_bar(stat = "identity", show.legend = FALSE) +
  coord_flip() +
  labs(y = "Revenue")

gm_flavor_Rrev <- gm_total_sales %>%
  filter(brand == "GENERAL MILLS", promo == 0) %>%
  ggplot(mapping = aes(x = flavor, y = price, fill = flavor)) +
  geom_bar(stat = "identity", show.legend = FALSE) +
  coord_flip() +
  labs(y = "Revenue")

(gm_flavor_Punits +  gm_flavor_Runits) / (gm_flavor_Prev + gm_flavor_Rrev)

  • Cocoa flavored cereal is the most impacted by promotions

    • Cocoa shifts up one position from lowest performing cereal in revenue and units sold when on promotion
    • All other cereals remain proportionally consistent between promotions and non-promotions

5.2 Ad performance

5.2.1 Ad performance across brands

# GM ad sales Figures in relation to others
gm_total_sales %>%
  filter(ad == "A" | ad == "B") %>%
  ggplot(mapping = aes(x = brand, fill = ad)) +
  geom_bar(position = "dodge") +
  labs(y = "Units Sold")

  • The pattern of General Mills sitting between Kelloggs and Post holds consistent

    • For both small and medium adverstiements GM sits between both competitors in overall units sold

5.2.2 Ad performance within brand based on cereal

# Examine ad impacts on GM cereal sales and revenue


gm_cereal_Aunits <- gm_total_sales %>%
  filter(brand == "GENERAL MILLS", ad == "A" | ad == "B") %>%
  ggplot(mapping = aes(x = cereal, fill = ad)) +
  geom_bar(position = "dodge") +
  coord_flip() +
  ggtitle("Ad") +
  labs(y = "Units Sold")


gm_cereal_Arevenue <- gm_total_sales %>%
  filter(brand == "GENERAL MILLS", ad == "A" | ad == "B") %>%
  group_by(ad, cereal) %>%
  summarise(rev = sum(price)) %>%
  ggplot(mapping = aes(x = cereal, y = rev, fill = ad)) +
  geom_bar(position = "dodge", stat = "identity", show.legend = FALSE) +
  coord_flip() +
  ggtitle("Ad") +
  labs(y = "Revenue") +
  scale_y_continuous(labels=scales::dollar_format())

gm_cereal_Aunits / gm_cereal_Arevenue

5.2.3 Ad performance within brand based on flavor

# Examine ad impacts on GM cereal flavor sales and revenue
gm_flavor_Aunits <- gm_total_sales %>%
  filter(brand == "GENERAL MILLS", ad == "A" | ad == "B") %>%
  ggplot(mapping = aes(x = flavor, fill = ad)) +
  geom_bar(position = "dodge") +
  coord_flip() +
  ggtitle("Ad") +
  labs(y = "Units Sold")

gm_flavor_Arevenue <- gm_total_sales %>%
  filter(brand == "GENERAL MILLS", ad == "A" | ad == "B") %>%
  group_by(ad, flavor) %>%
  summarise(rev = sum(price)) %>%
  ggplot(mapping = aes(x = flavor, y = rev, fill = ad)) +
  geom_bar(position = "dodge", stat = "identity", show.legend = FALSE) +
  coord_flip() +
  ggtitle("Ad") +
  labs(y = "Revenue") +
  scale_y_continuous(labels=scales::dollar_format())

gm_flavor_Aunits / gm_flavor_Arevenue

5.3 Overall Cereal Performance among brands

5.3.1 General Mills

# Cereal profile
gm_cereal_units <- gm_total_sales %>%
  filter(brand == "GENERAL MILLS") %>%
  ggplot(mapping = aes(x = cereal, fill = cereal)) +
  geom_bar(show.legend = FALSE) +
  coord_flip() +
  ggtitle("Total Sales") +
  labs(y = "Units Sold")

gm_cereal_revenue <- gm_total_sales %>%
  filter(brand == "GENERAL MILLS") %>%
  ggplot(mapping = aes(x = cereal, y = price, fill = cereal)) +
  geom_bar(stat = "Identity", show.legend = FALSE) +
  coord_flip() +
  ggtitle("") +
  labs(y = "Revenue")

gm_cereal_units / gm_cereal_revenue

5.3.2 Kelloggs

# Cereal profile
kg_cereal_units <- gm_total_sales %>%
  filter(brand == "KELLOGGS") %>%
  ggplot(mapping = aes(x = cereal, fill = cereal)) +
  geom_bar(show.legend = FALSE) +
  coord_flip() +
  ggtitle("Total Sales") +
  labs(y = "Units Sold")

kg_cereal_revenue <- gm_total_sales %>%
  filter(brand == "KELLOGGS") %>%
  ggplot(mapping = aes(x = cereal, y = price, fill = cereal)) +
  geom_bar(stat = "Identity", show.legend = FALSE) +
  coord_flip() +
  ggtitle("") +
  labs(y = "Revenue")

kg_cereal_units / kg_cereal_revenue

5.3.3 Post

# Cereal profile
pt_cereal_units <- gm_total_sales %>%
  filter(brand == "POST") %>%
  ggplot(mapping = aes(x = cereal, fill = cereal)) +
  geom_bar(show.legend = FALSE) +
  coord_flip() +
  ggtitle("Total Sales") +
  labs(y = "Units Sold")

pt_cereal_revenue <- gm_total_sales %>%
  filter(brand == "POST") %>%
  ggplot(mapping = aes(x = cereal, y = price, fill = cereal)) +
  geom_bar(stat = "Identity", show.legend = FALSE) +
  coord_flip() +
  ggtitle("") +
  labs(y = "Revenue")

pt_cereal_units / pt_cereal_revenue

5.4 Competitive strategy performance

5.4.1 Promotions

promotions <- gm_joined_data %>%
  filter(promo == 1) %>%
  group_by(brand, iri_key) %>%
  summarise(number_of_promo_weeks_per_store = n())

promotions <- promotions %>%
  group_by(brand) %>%
  summarise(number_of_stores_with_promos = n(),
            total_promo_weeks_across_stores = sum(number_of_promo_weeks_per_store))

promo_stores <- promotions %>%
  ggplot(mapping = aes(x = brand, y = number_of_stores_with_promos, fill = brand )) +
  geom_bar(stat = "identity", position = "dodge",  show.legend = FALSE) +
  labs(title = "Number of Stores with Promotions", y = "Count of Stores", x = "") +
  coord_flip()

promo_weeks <- promotions %>%
  ggplot(mapping = aes(x = brand, y = total_promo_weeks_across_stores, fill = brand )) +
  geom_bar(stat = "identity", position = "dodge",  show.legend = FALSE) +
  labs(title = "Number of weeks with Promotions", subtitle = "total promo weeks across all stores", y = "Count of Promo Weeks", x = "") +
  coord_flip()

promo_stores / promo_weeks

# create customized theme function starting with theme_classic()
clean_theme <- theme_classic() +
  theme(legend.direction = "horizontal", # create horizontal legend
        legend.position = "bottom", # put legend at bottom of graph
        legend.justification='left', # align legend to the left
        legend.title = element_blank(), # remove legend title
        axis.line.y = element_blank(), # remove y-axis line
        axis.ticks.y = element_blank(), # remove y-axis ticks
        axis.ticks.x = element_blank(), # remove x-axis ticks
        plot.title = element_text(face = "bold", size = 15)) # make graph title bold and a larger font

5.4.2 Toasted flavored products

5.4.2.1 Promotion Analysis

- set up data
toasted_promo_price_weekly <- gm_total_sales %>%
  filter(promo == 1, flavor == "TOASTED") %>%
  group_by(brand, week, flavor) %>%
  summarise(average_promo_price = median(price)) %>%
  select(-flavor)

toasted_average_non_promo_weekly_price<- gm_total_sales %>%
  filter(promo == 0, flavor == "TOASTED") %>%
  group_by(brand, week, flavor) %>%
  summarise(average_non_promo_price = median(price))%>%
  select(-flavor)


toasted_average_promo_weekly_units <- gm_joined_data %>%
  filter(promo == 1, flavor == "TOASTED") %>%
  group_by(brand, week, flavor) %>%
  summarise(average_promo_units = median(units))%>%
  select(-flavor)


toasted_average_non_promo_weekly_units <- gm_joined_data %>%
  filter(promo == 0, flavor == "TOASTED") %>%
  group_by(brand, week, flavor) %>%
  summarise(average_non_promo_units = median(units))%>%
  select(-flavor)


toasted_average_promo_weekly_store_count <- gm_joined_data %>%
  filter(promo == 1, flavor == "TOASTED") %>%
  group_by(brand, week, flavor) %>%
  summarise(promo_store_count = n())%>%
  select(-flavor)


toasted_average_no_promo_weekly_store_count <-gm_joined_data %>%
  filter(promo == 0, flavor == "TOASTED") %>%
  group_by(brand, week, flavor) %>%
  summarise(no_promo_store_count = n())%>%
  select(-flavor)


toasted_weekly_promo_analysis <- right_join(toasted_promo_price_weekly, toasted_average_non_promo_weekly_price, by = c("brand", "week")) 

toasted_weekly_promo_analysis <-  right_join(toasted_weekly_promo_analysis,  toasted_average_promo_weekly_units, by = c("brand", "week"))

toasted_weekly_promo_analysis <-  right_join(toasted_weekly_promo_analysis,  toasted_average_non_promo_weekly_units, by = c("brand", "week"))

toasted_weekly_promo_analysis <-  right_join(toasted_weekly_promo_analysis,  toasted_average_promo_weekly_store_count, by = c("brand", "week"))

toasted_weekly_promo_analysis <-  right_join(toasted_weekly_promo_analysis,  toasted_average_no_promo_weekly_store_count, by = c("brand", "week"))
- Look at Average Weekly Promotion pricing for each brand for toasted products
#Weekly promo prices by brand for regular products
toasted_promo_weekly_price <- toasted_weekly_promo_analysis %>%
  ggplot(mapping= aes(x = week, y = average_promo_price, group = brand, color = brand)) +
  geom_smooth(method = "loess", se = FALSE) +
  ggtitle(label = "General Mills toasted products are typically priced lower than \nKelloggs when there is a promotion", subtitle = "Average weekly price for toasted flavored products") +
  ylab("Average Promo Price") +
  xlab("Week") +
  clean_theme

toasted_promo_weekly_price
## `geom_smooth()` using formula 'y ~ x'

- Look at average units sold at promotion prices
#Weekly promo prices by brand for toasted products

min_lim <- min(toasted_weekly_promo_analysis$average_promo_price[toasted_weekly_promo_analysis$brand=='KELLOGGS'])
max_lim <- max(toasted_weekly_promo_analysis$average_promo_price[toasted_weekly_promo_analysis$brand=='KELLOGGS'])

toasted_most_competitive_promo_price <- toasted_weekly_promo_analysis %>%
  ggplot(mapping = aes(x = average_promo_price, y = average_promo_units, group = brand, color = brand)) +
  geom_smooth(method = "loess", se = FALSE) +
  ggtitle(label = "Most competitive promotion price is around $2.50 \nfor toasted flavored products") +
  ylab("Units Sold") +
  xlab("Average Price") +
  clean_theme +
  geom_vline(xintercept = 2.50, color = "red") +
  scale_x_continuous(breaks = c(0.0, 0.5, 1.0, 1.5, 2.0, 2.5, 3, 3.5),
                     labels=scales::dollar_format(),
                     limits = c(min_lim, 3.5)) 

toasted_most_competitive_promo_price
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 5 rows containing non-finite values (stat_smooth).

- Look at the number of stores running promotions for each brand
#Weekly promo prices by brand for regular products
toasted_store_count_promo <- toasted_weekly_promo_analysis %>%
  ggplot(mapping= aes(x = week, y = promo_store_count, group = brand, color = brand)) +
  geom_smooth(method = "loess", se = FALSE) +
  ggtitle(label = "General Mills should run promotions in more stores", subtitle = "Kelloggs consistently runs more in store promotions per week \nthan General Mills for toasted products") +
  ylab("Number of stores with promotions") +
  xlab("Week") +
  clean_theme 

toasted_store_count_promo
## `geom_smooth()` using formula 'y ~ x'

- show non-promotion analysis in some cohesive way  (below is not it)
toasted_promo_weekly_price + toasted_most_competitive_promo_price / toasted_store_count_promo
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 5 rows containing non-finite values (stat_smooth).
## `geom_smooth()` using formula 'y ~ x'

5.4.2.2 Non-Promotion Analysis

- Look at Average Weekly pricing for each brand when there isnt a promotion
#Weekly non=promo prices by brand for regular products
toasted_nonpromo_weekly_price <- toasted_weekly_promo_analysis %>%
  ggplot(mapping= aes(x = week, y = average_non_promo_price, group = brand, color = brand)) +
  geom_smooth(method = "loess", se = FALSE) +
  ggtitle(label = "Average price for toasted flavored products \nwhen there is no promotion") +
  ylab("Average Non-promo Price") +
  xlab("Week") +
  clean_theme

toasted_nonpromo_weekly_price
## `geom_smooth()` using formula 'y ~ x'

- Look at average units sold for non-promotion prices
#Weekly promo prices by brand for toasted products
min_lim <- min(toasted_weekly_promo_analysis$average_non_promo_price[toasted_weekly_promo_analysis$brand=='KELLOGGS'])
max_lim <- max(toasted_weekly_promo_analysis$average_non_promo_price[toasted_weekly_promo_analysis$brand=='KELLOGGS'])


toasted_most_competitive_nonpromo_price <- toasted_weekly_promo_analysis %>%
  ggplot(mapping = aes(x = average_non_promo_price, y = average_non_promo_units, group = brand, color = brand)) +
  geom_smooth(method = "loess", se = FALSE) +
  ggtitle(label = "Most competitive non-promotion price is around $3.59 \nfor toasted flavored products") +
  ylab("Units Sold") +
  xlab("Average Price") +
  clean_theme +
  geom_vline(xintercept = 3.59, color = "red")+
  scale_x_continuous(labels=scales::dollar_format(),
                     limits = c(min_lim, max_lim)) 

toasted_most_competitive_nonpromo_price
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 1 rows containing non-finite values (stat_smooth).

- Look at the number of stores selling toasted products each week for each brand
toasted_store_count_nonpromo <- toasted_weekly_promo_analysis %>%
  ggplot(mapping= aes(x = week, y = no_promo_store_count, group = brand, color = brand)) +
  geom_smooth(method = "loess", se = FALSE) +
  ggtitle(label = "General Mills should sell toasted products in more stores", subtitle = "Kelloggs typically sells their products in \nmore in stores than General Mills throughout the year") +
  ylab("Number of stores with promotions") +
  xlab("Week") +
  clean_theme 
toasted_store_count_nonpromo
## `geom_smooth()` using formula 'y ~ x'

- show non-promotion analysis in some cohesive way  (below is not it)
toasted_nonpromo_weekly_price + toasted_most_competitive_nonpromo_price / toasted_store_count_nonpromo
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## `geom_smooth()` using formula 'y ~ x'

5.4.3 Regular Flavored Products

  • Looking at promotion pricing, units sold and number of stores with weekly in-store promotions

    • Set up data
regular_promo_price_weekly <- gm_joined_data %>%
  filter(promo == 1, flavor == "REGULAR") %>%
  group_by(brand, week, flavor) %>%
  summarise(average_promo_price = median(price)) %>%
  select(-flavor)

regular_average_non_promo_weekly_price<- gm_joined_data %>%
  filter(promo == 0, flavor == "REGULAR") %>%
  group_by(brand, week, flavor) %>%
  summarise(average_non_promo_price = median(price)) %>%
  select(-flavor)

regular_average_promo_weekly_units <- gm_joined_data %>%
  filter(promo == 1, flavor == "REGULAR") %>%
  group_by(brand, week, flavor) %>%
  summarise(average_promo_units = median(units)) %>%
  select(-flavor)

regular_average_non_promo_weekly_units <- gm_joined_data %>%
  filter(promo == 0, flavor == "REGULAR") %>%
  group_by(brand, week, flavor) %>%
  summarise(average_non_promo_units = median(units)) %>%
  select(-flavor)

regular_average_promo_weekly_store_count <- gm_joined_data %>%
  filter(promo == 1, flavor == "REGULAR") %>%
  group_by(brand, week, flavor) %>%
  summarise(promo_store_count = n()) %>%
  select(-flavor)

regular_average_no_promo_weekly_store_count <-gm_joined_data %>%
  filter(promo == 0, flavor == "REGULAR") %>%
  group_by(brand, week, flavor) %>%
  summarise(no_promo_store_count = n()) %>%
  select(-flavor)

regular_weekly_promo_analysis <- right_join(regular_promo_price_weekly, regular_average_non_promo_weekly_price, by = c("brand", "week")) 

regular_weekly_promo_analysis <-  right_join(regular_weekly_promo_analysis, regular_average_promo_weekly_units, by = c("brand", "week"))

regular_weekly_promo_analysis <-  left_join(regular_weekly_promo_analysis, regular_average_non_promo_weekly_units, by = c("brand", "week"))

regular_weekly_promo_analysis <-  right_join(regular_weekly_promo_analysis, regular_average_promo_weekly_store_count, by = c("brand", "week"))

regular_weekly_promo_analysis <-  left_join(regular_weekly_promo_analysis, regular_average_no_promo_weekly_store_count, by = c("brand", "week"))

5.4.3.1 Promotion Analysis

- Look at Average Weekly Promotion pricing for each brand
#Weekly promo prices by brand for regular products
regular_promo_weekly_price <- regular_weekly_promo_analysis %>%
  ggplot(mapping= aes(x = week, y = average_promo_price, group = brand, color = brand)) +
  geom_smooth(method = "loess", se = FALSE) +
  ggtitle(label = "General Mills regular products are typically priced lower than \nKelloggs when there is a promotion", subtitle = "Average weekly price for regular flavored products") +
  ylab("Average Promo Price") +
  xlab("Week") +
  clean_theme

regular_promo_weekly_price
## `geom_smooth()` using formula 'y ~ x'

- Look at average units sold at promotion prices
#Weekly promo prices by brand for toasted products

min_lim <- min(regular_weekly_promo_analysis$average_promo_price[regular_weekly_promo_analysis$brand=='KELLOGGS'])
max_lim <- max(regular_weekly_promo_analysis$average_promo_price[regular_weekly_promo_analysis$brand=='KELLOGGS'])

regular_most_competitive_promo_price <- regular_weekly_promo_analysis %>%
  ggplot(mapping = aes(x = average_promo_price, y = average_promo_units, group = brand, color = brand)) +
  geom_smooth(method = "loess", se = FALSE) +
  ggtitle(label = "Most competitive promotion price is around $2.74 \nfor regular flavored products") +
  ylab("Units Sold") +
  xlab("Average Price") +
  clean_theme +
  geom_vline(xintercept = 2.74, color = "red") +
  scale_x_continuous(breaks = c(0.0, 0.5, 1.0, 1.5, 2.0, 2.5, 3, 3.5),
                     labels=scales::dollar_format(),
                     limits = c(min_lim, 3.5)) 

regular_most_competitive_promo_price
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 25 rows containing non-finite values (stat_smooth).

- Look at the number of stores running promotions for each brand
#Weekly promo prices by brand for regular products
regular_store_count_promo <- regular_weekly_promo_analysis %>%
  ggplot(mapping= aes(x = week, y = promo_store_count, group = brand, color = brand)) +
  geom_smooth(method = "loess", se = FALSE) +
  ggtitle(label = "General Mills should run promotions in more stores", subtitle = "Kelloggs consistently runs more in store promotions per week \nthan General Mills") +
  ylab("Number of stores with promotions") +
  xlab("Week") +
  clean_theme +
  clean_theme

regular_store_count_promo
## `geom_smooth()` using formula 'y ~ x'

- show promotion analysis in some cohesive way  (below is not it)
#regular_promo_weekly_price + regular_most_competitive_promo_price / regular_store_count_promo

5.4.3.2 Non-Promotion Analysis

- Look at Average Weekly pricing for each brand when there isnt a promotion
#Weekly non=promo prices by brand for regular products
regular_nonpromo_weekly_price <- regular_weekly_promo_analysis %>%
  ggplot(mapping= aes(x = week, y = average_non_promo_price, group = brand, color = brand)) +
  geom_smooth(method = "loess", se = FALSE) +
  ggtitle(label = "General Mills products are typically priced lower than \nKelloggs when there is no promotion", subtitle = "Average price for regular flavored products") +
  ylab("Average Non-promo Price") +
  xlab("Week") +
  clean_theme

regular_nonpromo_weekly_price
## `geom_smooth()` using formula 'y ~ x'

- Look at average units sold for non-promotion prices
#Weekly promo prices by brand for toasted products

min_lim <- min(regular_weekly_promo_analysis$average_non_promo_price[regular_weekly_promo_analysis$brand=='KELLOGGS'])
max_lim <- max(regular_weekly_promo_analysis$average_non_promo_price[regular_weekly_promo_analysis$brand=='KELLOGGS'])

regular_most_competitive_nonpromo_price <- regular_weekly_promo_analysis %>%
  ggplot(mapping = aes(x = average_non_promo_price, y = average_non_promo_units, group = brand, color = brand)) +
  geom_smooth(method = "loess", se = FALSE) +
  ggtitle(label = "Most competitive non-promotion price is around $4.17 \nfor regular flavored products") +
  ylab("Units Sold") +
  xlab("Average Price") +
  clean_theme +
  geom_vline(xintercept = 4.17, color = "red") +
  scale_x_continuous(labels=scales::dollar_format(),
                     limits = c(min_lim, max_lim)) 

regular_most_competitive_nonpromo_price
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 30 rows containing non-finite values (stat_smooth).

- Look at the number of stores selling regular products each week for each brand
regular_store_count_nonpromo <- regular_weekly_promo_analysis %>%
  ggplot(mapping= aes(x = week, y = no_promo_store_count, group = brand, color = brand)) +
  geom_smooth(method = "loess", se = FALSE) +
  ggtitle(label = "General Mills should sell products in more stores", subtitle = "Kelloggs consistently sells their products in \nmore in stores than General Mills") +
  ylab("Number of stores with promotions") +
  xlab("Week") +
  clean_theme +
  clean_theme
regular_store_count_nonpromo
## `geom_smooth()` using formula 'y ~ x'

- show non-promotion analysis in some cohesive way  (below is not it)
regular_nonpromo_weekly_price + regular_most_competitive_nonpromo_price / regular_store_count_nonpromo
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 30 rows containing non-finite values (stat_smooth).
## `geom_smooth()` using formula 'y ~ x'

6 Statistical EDA

6.1 General Mills Overall Performance

6.1.1 Is there a difference in promotion sales of Generals Mills products compared to the competiton

Null: There is no difference in product sales of General Mills compared to Kelloggs and Post

Alternative: Product sales between General Mills and the competition differs

chisq.test(table(gm_total_sales$promo, gm_total_sales$brand))
## 
##  Pearson's Chi-squared test
## 
## data:  table(gm_total_sales$promo, gm_total_sales$brand)
## X-squared = 892.64, df = 2, p-value < 0.00000000000000022
  • Small p value so we reject the null that there is no difference in sales

    • From the chi-square test we can not tell directionationly
# Make table of counts to calculate confidence interval
P_B_n <- gm_total_sales %>%
  group_by(promo, brand) %>%
  summarise(n = n())
## group_by: 2 grouping variables (promo, brand)
## summarise: now 6 rows and 3 columns, one group variable remaining (promo)
# Calculate confidence interval using multinomial
P_B_n_ci <- multinomialCI(t(P_B_n[,3]), 0.01)

# Create a table with proportions that is ggplot friendly
P_B_tab <- gm_total_sales %>%
  group_by(promo, brand) %>%
  summarise(prop = round(n()/sum(nrow(gm_total_sales)), 3))
## group_by: 2 grouping variables (promo, brand)
## summarise: now 6 rows and 3 columns, one group variable remaining (promo)
# add the confidence interval to the table of proportions
P_B_tab$ci_1 <- round(P_B_n_ci[,1], 3)
P_B_tab$ci_u <- round(P_B_n_ci[,2], 3)

htmlTable(P_B_tab)
promo brand prop ci_1 ci_u
1 0 GENERAL MILLS 0.283 0.28 0.286
2 0 KELLOGGS 0.385 0.382 0.388
3 0 POST 0.06 0.057 0.063
4 1 GENERAL MILLS 0.085 0.082 0.088
5 1 KELLOGGS 0.161 0.158 0.165
6 1 POST 0.025 0.022 0.028
  • All Brands are statistically different from 0, so there is a difference across the board

    • More analysis needed to determine direction
# Graph of proportions with confidence intervals
P_B_tab %>% 
  ggplot(aes(x = brand, y = prop, fill = promo)) +
  geom_bar(stat="identity", position = "dodge") +
  geom_text(aes(label = round(prop, 2)), vjust = -.5, color = "black", # vjust moves labels above CI
            position = position_dodge(0.9), size = 4) +
  geom_errorbar(aes(ymin = ci_1, ymax = ci_u), 
                width = 0.4, position = position_dodge(0.9))

  • Kelloggs reliably sells the most units in both on and off promotion

    • We reject the null hypothesis that General Mills performs equally to Kelloggs and Post
    • General Mills lags behing kelloggs in both scenarios at a 99% confidence interval
library("PerformanceAnalytics")
correlation_chart <- gm_total_sales %>%
  mutate(brand = as.integer(brand), promo = as.integer(promo)) %>%
  select(brand, promo) %>% 
  chart.Correlation()

correlation_chart
## NULL
  • Not practically significant

    • Since the correlation coefficient is small

6.2 General Mills Promotion Performance

6.2.1 Is there a difference in promotion performance (units sold) over cereal types

Null: During a promotion all brand’s cereal performs similarly in units sold

Alternate: Product performance differs among the brands during a promotion

chisq.test(table(gm_total_sales$promo, gm_total_sales$cereal))
## 
##  Pearson's Chi-squared test
## 
## data:  table(gm_total_sales$promo, gm_total_sales$cereal)
## X-squared = 3424.1, df = 14, p-value < 0.00000000000000022
  • The p value is small so we reject the null that cereals perform similarly in units sold
# Make table of counts to calculate confidence interval
P_C_n <- gm_total_sales %>%
  group_by(promo, cereal) %>%
  summarise(n = n())
## group_by: 2 grouping variables (promo, cereal)
## summarise: now 30 rows and 3 columns, one group variable remaining (promo)
# Calculate confidence interval using multinomial
P_C_n_ci <- multinomialCI(t(P_C_n[,3]), 0.01)

# Create a table with proportions that is ggplot friendly
P_C_tab <- gm_total_sales %>%
  group_by(promo, cereal) %>%
  summarise(prop = round(n()/sum(nrow(gm_total_sales)), 3))
## group_by: 2 grouping variables (promo, cereal)
## summarise: now 30 rows and 3 columns, one group variable remaining (promo)
# add the confidence interval to the table of proportions
P_C_tab$ci_1 <- round(P_C_n_ci[,1], 3)
P_C_tab$ci_u <- round(P_C_n_ci[,2], 3)

htmlTable(P_C_tab)
promo cereal prop ci_1 ci_u
1 0 CHEERIOS 0.091 0.089 0.093
2 0 CINNAMON TST CR 0.066 0.064 0.068
3 0 COCOA KRISPIES 0.015 0.013 0.017
4 0 COCOA PUFFS 0.028 0.026 0.03
5 0 FROOT LOOPS 0.048 0.046 0.05
6 0 FROSTED FLAKES 0.084 0.082 0.086
7 0 FROSTED MINI WHEATS 0.06 0.058 0.062
8 0 GRAPE NUTS 0.035 0.033 0.037
9 0 KIX 0.038 0.036 0.04
10 0 LUCKY CHARMS 0.06 0.058 0.062
11 0 RAISIN BRAN 0.052 0.051 0.054
12 0 RICE KRISPIES 0.056 0.054 0.058
13 0 SHREDDED WHEAT 0.025 0.023 0.027
14 0 SMART START 0.022 0.02 0.024
15 0 SPECIAL K 0.048 0.046 0.049
16 1 CHEERIOS 0.018 0.016 0.02
17 1 CINNAMON TST CR 0.02 0.018 0.022
18 1 COCOA KRISPIES 0.009 0.007 0.011
19 1 COCOA PUFFS 0.015 0.013 0.017
20 1 FROOT LOOPS 0.026 0.024 0.028
21 1 FROSTED FLAKES 0.032 0.03 0.034
22 1 FROSTED MINI WHEATS 0.021 0.019 0.023
23 1 GRAPE NUTS 0.014 0.012 0.015
24 1 KIX 0.011 0.009 0.013
25 1 LUCKY CHARMS 0.022 0.02 0.024
26 1 RAISIN BRAN 0.016 0.014 0.018
27 1 RICE KRISPIES 0.021 0.019 0.022
28 1 SHREDDED WHEAT 0.011 0.009 0.013
29 1 SMART START 0.016 0.014 0.018
30 1 SPECIAL K 0.021 0.019 0.022
  • All cereals are statistically different from 0, so there is a difference across the board

    • More analysis needed to determine direction
# Graph of proportions with confidence intervals
P_C_tab %>% 
  ggplot(aes(x = cereal, y = prop, fill = promo)) +
  geom_bar(stat="identity", position = "dodge") +
  geom_text(aes(label = round(prop, 2)), vjust = -1, color = "black", # vjust moves labels above CI
            position = position_dodge(0.9), size = 4) +
  geom_errorbar(aes(ymin = ci_1, ymax = ci_u), 
                width = 0.4, position = position_dodge(0.9))

  • All cereals reliably sells the most units when there is no promotion

    • We reject the null hypothesis that cereals performs equally during a promotion
    • It appears the cereal spread decreases when cereals are on promotion
library("PerformanceAnalytics")
correlation_chart <- gm_total_sales %>%
  mutate(cereal = as.integer(cereal), promo = as.integer(promo)) %>%
  select(cereal, promo) %>% 
  chart.Correlation()

  • The relationship between cereals is statistically significant but not practically significant

6.2.2 Is there a difference among General Mills’ cereal products performance during a promotion *****************************************************************

Null: All General Mills cereals perform similarly during a promotion

Alternative: Different General Mills cereals perform differently during a promotion

chisq.test(table(gm_only$promo, gm_only$cereal))
## 
##  Pearson's Chi-squared test
## 
## data:  table(gm_only$promo, gm_only$cereal)
## X-squared = 1289.5, df = 4, p-value < 0.00000000000000022
  • Due to the small p value we reject the null that GM cereals perform the same under a promotion

    • More work needed to show performance
# Make table of counts to calculate confidence interval
GM_P_C_n <- gm_only %>%
  group_by(promo, cereal) %>%
  summarise(n = n())
## group_by: 2 grouping variables (promo, cereal)
## summarise: now 10 rows and 3 columns, one group variable remaining (promo)
# Calculate confidence interval using multinomial
GM_P_C_n_ci <- multinomialCI(t(GM_P_C_n[,3]), 0.01)

# Create a table with proportions that is ggplot friendly
GM_P_C_tab <- gm_only %>%
  group_by(promo, cereal) %>%
  summarise(prop = round(n()/sum(nrow(gm_only)), 3))
## group_by: 2 grouping variables (promo, cereal)
## summarise: now 10 rows and 3 columns, one group variable remaining (promo)
# add the confidence interval to the table of proportions
GM_P_C_tab$ci_1 <- round(GM_P_C_n_ci[,1], 3)
GM_P_C_tab$ci_u <- round(GM_P_C_n_ci[,2], 3)

htmlTable(GM_P_C_tab)
promo cereal prop ci_1 ci_u
1 0 CHEERIOS 0.248 0.243 0.252
2 0 CINNAMON TST CR 0.179 0.174 0.183
3 0 COCOA PUFFS 0.077 0.072 0.081
4 0 KIX 0.102 0.098 0.107
5 0 LUCKY CHARMS 0.163 0.159 0.167
6 1 CHEERIOS 0.048 0.044 0.053
7 1 CINNAMON TST CR 0.053 0.049 0.058
8 1 COCOA PUFFS 0.041 0.037 0.046
9 1 KIX 0.029 0.025 0.034
10 1 LUCKY CHARMS 0.059 0.055 0.064
  • All values are statistically different from zero so there is a difference in performance
# Graph of proportions with confidence intervals
GM_P_C_tab %>% 
  ggplot(aes(x = cereal, y = prop, fill = promo)) +
  geom_bar(stat="identity", position = "dodge") +
  geom_text(aes(label = round(prop, 2)), vjust = -1, color = "black", # vjust moves labels above CI
            position = position_dodge(0.9), size = 4) +
  geom_errorbar(aes(ymin = ci_1, ymax = ci_u), 
                width = 0.4, position = position_dodge(0.9))

    • We reject the null hypothesis that cereals performs equally during a promotion
    • It appears the cereal spread decreases when cereals are on promotion
library("PerformanceAnalytics")
correlation_chart <- gm_only %>%
  mutate(cereal = as.integer(cereal), promo = as.integer(promo)) %>%
  select(cereal, promo) %>% 
  chart.Correlation()

correlation_chart
## NULL
  • The relationship between cereal and promo is statistically significant and practically significant

6.2.3 Is there a difference in promotion performance (units sold) over flavor types

Null: During a promotion all brand’s flavors perform similarly in units sold

Alternate: Product performance differs among the flavors during a promotion

chisq.test(table(gm_total_sales$promo, gm_total_sales$flavor))
## 
##  Pearson's Chi-squared test
## 
## data:  table(gm_total_sales$promo, gm_total_sales$flavor)
## X-squared = 1199.8, df = 4, p-value < 0.00000000000000022
  • The p value is small so we reject the null that flavors perform similarly in units sold
# Make table of counts to calculate confidence interval
P_D_n <- gm_total_sales %>%
  group_by(promo, flavor) %>%
  summarise(n = n())
## group_by: 2 grouping variables (promo, flavor)
## summarise: now 10 rows and 3 columns, one group variable remaining (promo)
# Calculate confidence interval using multinomial
P_D_n_ci <- multinomialCI(t(P_D_n[,3]), 0.01)

# Create a table with proportions that is ggplot friendly
P_D_tab <- gm_total_sales %>%
  group_by(promo, flavor) %>%
  summarise(prop = round(n()/sum(nrow(gm_total_sales)), 3))
## group_by: 2 grouping variables (promo, flavor)
## summarise: now 10 rows and 3 columns, one group variable remaining (promo)
# add the confidence interval to the table of proportions
P_D_tab$ci_1 <- round(P_D_n_ci[,1], 3)
P_D_tab$ci_u <- round(P_D_n_ci[,2], 3)

htmlTable(P_D_tab)
promo flavor prop ci_1 ci_u
1 0 CINNAMON TOAST 0.066 0.063 0.069
2 0 COCOA 0.043 0.04 0.046
3 0 FRUIT 0.048 0.045 0.051
4 0 REGULAR 0.294 0.291 0.297
5 0 TOASTED 0.277 0.274 0.28
6 1 CINNAMON TOAST 0.02 0.017 0.023
7 1 COCOA 0.025 0.022 0.027
8 1 FRUIT 0.026 0.023 0.029
9 1 REGULAR 0.105 0.102 0.108
10 1 TOASTED 0.096 0.094 0.099
  • All flavors are statistically different from 0, so there is a difference across the board

    • More analysis needed to determine direction
# Graph of proportions with confidence intervals
P_D_tab %>% 
  ggplot(aes(x = flavor, y = prop, fill = promo)) +
  geom_bar(stat="identity", position = "dodge") +
  geom_text(aes(label = round(prop, 2)), vjust = -1, color = "black", # vjust moves labels above CI
            position = position_dodge(0.9), size = 4) +
  geom_errorbar(aes(ymin = ci_1, ymax = ci_u), 
                width = 0.4, position = position_dodge(0.9))

  • All flavors reliably sells the most units when there is no promotion

    • We reject the null hypothesis that flavors performs equally during a promotion
    • It appears the flavors spread decreases when cereals are on promotion
library("PerformanceAnalytics")
correlation_chart <- gm_total_sales %>%
  mutate(flavor = as.integer(flavor), promo = as.integer(promo)) %>%
  select(flavor, promo) %>% 
  chart.Correlation()

correlation_chart
## NULL
  • The relationship between flavor and promo is statistically significant but not practically significant

6.2.4 Is there a difference in General Mills product performance based on flavors during a promotion

Null: All flavors perform similarly during a promotion

Alternative: Different flavors perform differently during a promotion

chisq.test(table(gm_only$promo, gm_only$flavor))
## 
##  Pearson's Chi-squared test
## 
## data:  table(gm_only$promo, gm_only$flavor)
## X-squared = 759.14, df = 3, p-value < 0.00000000000000022
  • We reject the null that all flavors perform similarly during a promotion
# Make table of counts to calculate confidence interval
GM_P_F_n <- gm_only %>%
  group_by(promo, flavor) %>%
  summarise(n = n())
## group_by: 2 grouping variables (promo, flavor)
## summarise: now 8 rows and 3 columns, one group variable remaining (promo)
# Calculate confidence interval using multinomial
GM_P_F_n_ci <- multinomialCI(t(GM_P_F_n[,3]), 0.01)

# Create a table with proportions that is ggplot friendly
GM_P_F_tab <- gm_only %>%
  group_by(promo, flavor) %>%
  summarise(prop = round(n()/sum(nrow(gm_only)), 3))
## group_by: 2 grouping variables (promo, flavor)
## summarise: now 8 rows and 3 columns, one group variable remaining (promo)
# add the confidence interval to the table of proportions
GM_P_F_tab$ci_1 <- round(GM_P_F_n_ci[,1], 3)
GM_P_F_tab$ci_u <- round(GM_P_F_n_ci[,2], 3)

htmlTable(GM_P_F_tab)
promo flavor prop ci_1 ci_u
1 0 CINNAMON TOAST 0.179 0.174 0.184
2 0 COCOA 0.077 0.072 0.082
3 0 REGULAR 0.103 0.098 0.108
4 0 TOASTED 0.41 0.405 0.415
5 1 CINNAMON TOAST 0.053 0.049 0.058
6 1 COCOA 0.041 0.036 0.046
7 1 REGULAR 0.029 0.025 0.034
8 1 TOASTED 0.107 0.103 0.112
  • All flavors are statistically different from 0, so there is a difference across the board

    • More analysis needed to determine direction
# Graph of proportions with confidence intervals
GM_P_F_tab %>% 
  ggplot(aes(x = flavor, y = prop, fill = promo)) +
  geom_bar(stat="identity", position = "dodge") +
  geom_text(aes(label = round(prop, 2)), vjust = -1, color = "black", # vjust moves labels above CI
            position = position_dodge(0.9), size = 4) +
  geom_errorbar(aes(ymin = ci_1, ymax = ci_u), 
                width = 0.4, position = position_dodge(0.9))

  • All flavors reliably sells the most units when there is no promotion

    • We reject the null hypothesis that flavors performs equally during a promotion
    • It appears the flavors spread decreases when cereals are on promotion
library("PerformanceAnalytics")
correlation_chart <- gm_only %>%
  mutate(flavor = as.integer(flavor), promo = as.integer(promo)) %>%
  select(flavor, promo) %>% 
  chart.Correlation()

correlation_chart
## NULL
  • Not practically significant

6.3 General Mills Ad performance

6.3.1 Is there a difference in Ad performance in units sold between brands

Null: There is no difference in ad performance between brands

Alternative: Ad performance varies between brands

chisq.test(table(gm_total_sales$ad, gm_total_sales$brand))
## 
##  Pearson's Chi-squared test
## 
## data:  table(gm_total_sales$ad, gm_total_sales$brand)
## X-squared = 737.73, df = 4, p-value < 0.00000000000000022
  • Small p value so we reject the null that there is no difference in ad usage

    • From the chi-square test we can not tell directionationly
# Make table of counts to calculate confidence interval
A_B_n <- gm_total_sales %>%
  group_by(ad, brand) %>%
  summarise(n = n())
## group_by: 2 grouping variables (ad, brand)
## summarise: now 9 rows and 3 columns, one group variable remaining (ad)
# Calculate confidence interval using multinomial
A_B_n_ci <- multinomialCI(t(A_B_n[,3]), 0.01)

# Create a table with proportions that is ggplot friendly
A_B_tab <- gm_total_sales %>%
  group_by(ad, brand) %>%
  summarise(prop = round(n()/sum(nrow(gm_total_sales)), 3))
## group_by: 2 grouping variables (ad, brand)
## summarise: now 9 rows and 3 columns, one group variable remaining (ad)
# add the confidence interval to the table of proportions
A_B_tab$ci_1 <- round(A_B_n_ci[,1], 3)
A_B_tab$ci_u <- round(A_B_n_ci[,2], 3)

htmlTable(A_B_tab)
ad brand prop ci_1 ci_u
1 A GENERAL MILLS 0.028 0.025 0.031
2 A KELLOGGS 0.05 0.047 0.053
3 A POST 0.006 0.003 0.009
4 B GENERAL MILLS 0.017 0.014 0.02
5 B KELLOGGS 0.04 0.037 0.043
6 B POST 0.006 0.003 0.009
7 NONE GENERAL MILLS 0.323 0.32 0.326
8 NONE KELLOGGS 0.457 0.454 0.46
9 NONE POST 0.073 0.07 0.076
  • All Brands are statistically different from 0, so there is a difference across the board

    • More analysis needed to determine direction
# Graph of proportions with confidence intervals
A_B_tab %>% 
  ggplot(aes(x = brand, y = prop, fill = ad)) +
  geom_bar(stat="identity", position = "dodge") +
  geom_text(aes(label = round(prop, 2)), vjust = -.5, color = "black", # vjust moves labels above CI
            position = position_dodge(0.9), size = 4) +
  geom_errorbar(aes(ymin = ci_1, ymax = ci_u), 
                width = 0.4, position = position_dodge(0.9))

  • All Brands reliably sell the most units when there is no advertisement

    • We reject the null hypothesis that General Mills performs equally to Kelloggs and Post with ad useage
    • General Mills lags behing kelloggs in both scenarios at a 99% confidence interval
library("PerformanceAnalytics")
correlation_chart <- gm_total_sales %>%
  mutate(brand = as.integer(brand), ad = as.integer(ad)) %>%
  select(brand, ad) %>% 
  chart.Correlation()

correlation_chart
## NULL
  • Not practically significant

    • Since the correlation coefficient is small

6.3.2 Is there a difference among General Mills’ product performance during ad usage

Null: There is no difference in product performance with ad usage

Alternative: Product sales differ with ad usage

chisq.test(table(gm_only$ad, gm_only$cereal))
## 
##  Pearson's Chi-squared test
## 
## data:  table(gm_only$ad, gm_only$cereal)
## X-squared = 558.41, df = 8, p-value < 0.00000000000000022
  • We reject the null that all products perform similarly during ads
# Make table of counts to calculate confidence interval
GM_A_C_n <- gm_only %>%
  group_by(ad, cereal) %>%
  summarise(n = n())
## group_by: 2 grouping variables (ad, cereal)
## summarise: now 15 rows and 3 columns, one group variable remaining (ad)
# Calculate confidence interval using multinomial
GM_A_C_n_ci <- multinomialCI(t(GM_A_C_n[,3]), 0.01)

# Create a table with proportions that is ggplot friendly
GM_A_C_tab <- gm_only %>%
  group_by(ad, cereal) %>%
  summarise(prop = round(n()/sum(nrow(gm_only)), 3))
## group_by: 2 grouping variables (ad, cereal)
## summarise: now 15 rows and 3 columns, one group variable remaining (ad)
# add the confidence interval to the table of proportions
GM_A_C_tab$ci_1 <- round(GM_A_C_n_ci[,1], 3)
GM_A_C_tab$ci_u <- round(GM_A_C_n_ci[,2], 3)

htmlTable(GM_A_C_tab)
ad cereal prop ci_1 ci_u
1 A CHEERIOS 0.016 0.012 0.021
2 A CINNAMON TST CR 0.018 0.014 0.023
3 A COCOA PUFFS 0.012 0.007 0.017
4 A KIX 0.01 0.005 0.015
5 A LUCKY CHARMS 0.019 0.014 0.024
6 B CHEERIOS 0.011 0.006 0.015
7 B CINNAMON TST CR 0.008 0.004 0.013
8 B COCOA PUFFS 0.009 0.004 0.014
9 B KIX 0.008 0.003 0.013
10 B LUCKY CHARMS 0.011 0.006 0.016
11 NONE CHEERIOS 0.269 0.265 0.274
12 NONE CINNAMON TST CR 0.206 0.201 0.21
13 NONE COCOA PUFFS 0.097 0.092 0.102
14 NONE KIX 0.113 0.109 0.118
15 NONE LUCKY CHARMS 0.192 0.187 0.197
  • All products are statistically different from 0, so there is a difference across the board

    • More analysis needed to determine direction
# Graph of proportions with confidence intervals
GM_A_C_tab %>% 
  ggplot(aes(x = cereal, y = prop, fill = ad)) +
  geom_bar(stat="identity", position = "dodge") +
  geom_text(aes(label = round(prop, 2)), vjust = -1, color = "black", # vjust moves labels above CI
            position = position_dodge(0.9), size = 4) +
  geom_errorbar(aes(ymin = ci_1, ymax = ci_u), 
                width = 0.4, position = position_dodge(0.9))

    • We reject the null hypothesis that products performs equally during a promotion
    • It appears the products spread decreases when cereals are on promotion
library("PerformanceAnalytics")
correlation_chart <- gm_only %>%
  mutate(cereal = as.integer(cereal), ad = as.integer(ad)) %>%
  select(cereal, ad) %>% 
  chart.Correlation()

correlation_chart
## NULL
  • The relationship between cereal and ad is statistically significant but not practically significant

6.3.3 Is there a difference among General Mills’ flavor performance during ad usage

Null: All flavors perform similarly during ad usage

Alternative: Flavor performance changes with ad usage

chisq.test(table(gm_only$ad, gm_only$flavor))
## 
##  Pearson's Chi-squared test
## 
## data:  table(gm_only$ad, gm_only$flavor)
## X-squared = 394.12, df = 6, p-value < 0.00000000000000022
  • We reject the null that all flavors perform similarly during ads
# Make table of counts to calculate confidence interval
GM_A_F_n <- gm_only %>%
  group_by(ad, flavor) %>%
  summarise(n = n())
## group_by: 2 grouping variables (ad, flavor)
## summarise: now 12 rows and 3 columns, one group variable remaining (ad)
# Calculate confidence interval using multinomial
GM_A_F_n_ci <- multinomialCI(t(GM_A_F_n[,3]), 0.01)

# Create a table with proportions that is ggplot friendly
GM_A_F_tab <- gm_only %>%
  group_by(ad, flavor) %>%
  summarise(prop = round(n()/sum(nrow(gm_only)), 3))
## group_by: 2 grouping variables (ad, flavor)
## summarise: now 12 rows and 3 columns, one group variable remaining (ad)
# add the confidence interval to the table of proportions
GM_A_F_tab$ci_1 <- round(GM_A_F_n_ci[,1], 3)
GM_A_F_tab$ci_u <- round(GM_A_F_n_ci[,2], 3)

htmlTable(GM_A_F_tab)
ad flavor prop ci_1 ci_u
1 A CINNAMON TOAST 0.018 0.014 0.023
2 A COCOA 0.012 0.007 0.017
3 A REGULAR 0.01 0.005 0.015
4 A TOASTED 0.035 0.03 0.04
5 B CINNAMON TOAST 0.008 0.003 0.013
6 B COCOA 0.009 0.004 0.014
7 B REGULAR 0.008 0.003 0.013
8 B TOASTED 0.022 0.017 0.026
9 NONE CINNAMON TOAST 0.206 0.201 0.21
10 NONE COCOA 0.097 0.092 0.102
11 NONE REGULAR 0.114 0.109 0.119
12 NONE TOASTED 0.46 0.456 0.465
  • All flavors are statistically different from 0, so there is a difference across the board

    • More analysis needed to determine direction
# Graph of proportions with confidence intervals
GM_A_F_tab %>% 
  ggplot(aes(x = flavor, y = prop, fill = ad)) +
  geom_bar(stat="identity", position = "dodge") +
  geom_text(aes(label = round(prop, 2)), vjust = -1, color = "black", # vjust moves labels above CI
            position = position_dodge(0.9), size = 4) +
  geom_errorbar(aes(ymin = ci_1, ymax = ci_u), 
                width = 0.4, position = position_dodge(0.9))

    • We reject the null hypothesis that flavors performs equally during a promotion
    • It appears the flavors spread decreases when cereals are on promotion
library("PerformanceAnalytics")
correlation_chart <- gm_only %>%
  mutate(flavor = as.integer(flavor), ad = as.integer(ad)) %>%
  select(flavor, ad) %>% 
  chart.Correlation()

correlation_chart
## NULL
  • The relationship between flavor and ad is statistically significant but not practically significant

6.4 Store Performance

6.4.1 Promotion frequency

Null: There is no difference in the number of stores running weekly promotions for each brands

Alternative: Different brands have dissimilar promotion usage in stores

promo_price_weekly <- gm_total_sales %>%
  filter(promo == 1) %>%
  group_by(brand, week) %>%
  summarise(average_promo_price = median(price))
## filter: removed 136,561 rows (73%), 50,889 rows remaining
## group_by: 2 grouping variables (brand, week)
## summarise: now 156 rows and 3 columns, one group variable remaining (brand)
average_non_promo_weekly_price<- gm_total_sales %>%
  filter(promo == 0) %>%
  group_by(brand, week) %>%
  summarise(average_non_promo_price = median(price))
## filter: removed 50,889 rows (27%), 136,561 rows remaining
## group_by: 2 grouping variables (brand, week)
## summarise: now 156 rows and 3 columns, one group variable remaining (brand)
average_promo_weekly_units <- gm_joined_data %>%
  filter(promo == 1) %>%
  group_by(brand, week) %>%
  summarise(average_promo_units = median(units))
## filter: removed 17,305 rows (79%), 4,545 rows remaining
## group_by: 2 grouping variables (brand, week)
## summarise: now 156 rows and 3 columns, one group variable remaining (brand)
average_non_promo_weekly_units <- gm_joined_data %>%
  filter(promo == 0) %>%
  group_by(brand, week) %>%
  summarise(average_no_promo_units = median(units))
## filter: removed 4,545 rows (21%), 17,305 rows remaining
## group_by: 2 grouping variables (brand, week)
## summarise: now 156 rows and 3 columns, one group variable remaining (brand)
average_promo_weekly_store_count <- gm_joined_data %>%
  filter(promo == 1) %>%
  group_by(brand, week) %>%
  summarise(promo_store_count = n())
## filter: removed 17,305 rows (79%), 4,545 rows remaining
## group_by: 2 grouping variables (brand, week)
## summarise: now 156 rows and 3 columns, one group variable remaining (brand)
average_no_promo_weekly_store_count <-gm_joined_data %>%
  filter(promo == 0) %>%
  group_by(brand, week) %>%
  summarise(no_promo_store_count = n())
## filter: removed 4,545 rows (21%), 17,305 rows remaining
## group_by: 2 grouping variables (brand, week)
## summarise: now 156 rows and 3 columns, one group variable remaining (brand)
weekly_promo_analysis <- right_join(promo_price_weekly, average_non_promo_weekly_price, by = c("brand", "week")) 
## right_join: added one column (average_non_promo_price)
##             > rows only in x  (  0)
##             > rows only in y     0
##             > matched rows     156
##             >                 =====
##             > rows total       156
weekly_promo_analysis <-  right_join(weekly_promo_analysis,  average_promo_weekly_units, by = c("brand", "week"))
## right_join: added one column (average_promo_units)
##             > rows only in x  (  0)
##             > rows only in y     0
##             > matched rows     156
##             >                 =====
##             > rows total       156
weekly_promo_analysis <-  right_join(weekly_promo_analysis,  average_non_promo_weekly_units, by = c("brand", "week"))
## right_join: added one column (average_no_promo_units)
##             > rows only in x  (  0)
##             > rows only in y     0
##             > matched rows     156
##             >                 =====
##             > rows total       156
weekly_promo_analysis <-  right_join(weekly_promo_analysis,  average_promo_weekly_store_count, by = c("brand", "week"))
## right_join: added one column (promo_store_count)
##             > rows only in x  (  0)
##             > rows only in y     0
##             > matched rows     156
##             >                 =====
##             > rows total       156
weekly_promo_analysis <-  right_join(weekly_promo_analysis,  average_no_promo_weekly_store_count, by = c("brand", "week"))
## right_join: added one column (no_promo_store_count)
##             > rows only in x  (  0)
##             > rows only in y     0
##             > matched rows     156
##             >                 =====
##             > rows total       156
# Set up data set for regression
vtable(weekly_promo_analysis)
weekly_promo_analysis
Name Class Values
brand factor ‘GENERAL MILLS’ ‘KELLOGGS’ ‘POST’
week integer Num: 1 to 52
average_promo_price numeric Num: 2.12 to 4.08
average_non_promo_price numeric Num: 3.45 to 4.29
average_promo_units numeric Num: 1.5 to 18
average_no_promo_units numeric Num: 2 to 9
promo_store_count integer Num: 2 to 70
no_promo_store_count integer Num: 22 to 237
# Linear regression with linear model
store_promo_regression <- lm(promo_store_count ~ brand + week, data = weekly_promo_analysis)
# Review output
summary(store_promo_regression)
## 
## Call:
## lm(formula = promo_store_count ~ brand + week, data = weekly_promo_analysis)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -19.0580  -4.1579  -0.0225   3.9540  18.7438 
## 
## Coefficients:
##                Estimate Std. Error t value            Pr(>|t|)    
## (Intercept)    25.92873    1.42473  18.199 <0.0000000000000002 ***
## brandKELLOGGS  27.55769    1.41097  19.531 <0.0000000000000002 ***
## brandPOST     -14.00000    1.41097  -9.922 <0.0000000000000002 ***
## week           -0.04956    0.03838  -1.291               0.199    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.195 on 152 degrees of freedom
## Multiple R-squared:  0.8555, Adjusted R-squared:  0.8527 
## F-statistic:   300 on 3 and 152 DF,  p-value: < 0.00000000000000022
  • Small p value so we reject the null that there is no difference in number of stores where promos are ran
par(mfrow = c(1, 1))
plot(weekly_promo_analysis$promo_store_count, store_promo_regression$residuals)

# Pull out the coefficients and confidence interval for table and graph
coe <- summary(store_promo_regression)$coefficients # get coefficients and related stats
coe_CI <- as.data.frame(cbind(coe[-1, ], confint(store_promo_regression)[-1, ])) # find and bind CI, remove Intercept 

# Rename results data frame
names(coe_CI) <- c("estimate", "se", "t", "pval","low_CI","high_CI") 

htmlTable(round(coe_CI[order(coe_CI$pval, decreasing = FALSE), ], 3))
estimate se t pval low_CI high_CI
brandKELLOGGS 27.558 1.411 19.531 0 24.77 30.345
brandPOST -14 1.411 -9.922 0 -16.788 -11.212
week -0.05 0.038 -1.291 0.199 -0.125 0.026
# Cleveland dot plot of results
ggplot(coe_CI, aes(x = estimate, y = row.names(coe_CI))) +
  geom_point(size = 3) +
  xlim(min(coe_CI$low_CI), max(coe_CI$high_CI)) +
  ylab("Variable") +
  xlab("Coefficient") +
  theme_bw()+
  geom_segment(aes(yend = reorder(row.names(coe_CI),desc(pval))), 
               xend = coe_CI$high_CI, color = "Blue") +
  geom_segment(aes(yend = reorder(row.names(coe_CI),desc(coe_CI$pval))), 
               xend = coe_CI$low_CI, color = "Blue") +
   xlab("Coefficient with Confidence Interval")+
  geom_vline(xintercept = 0, color = "red")
## Warning: Use of `coe_CI$pval` is discouraged. Use `pval` instead.

+ Kelloggs and Post reliably run promotions in a different number of stores than General Mills

- We reject the null hypothesis that General Mills has the same number of stores running promotions as Kelloggs and Post
- General Mills runs promotions in less stores than Kelloggs and more than Post at a 99% confidence interval
library("PerformanceAnalytics")
correlation_chart <- weekly_promo_analysis %>%
  mutate(brand = as.integer(brand)) %>%
  select(brand, promo_store_count) %>% 
  chart.Correlation()

  • Results are practically significant

6.4.2 Toasted Flavor Pormotion frequency

Null: There is no difference in promotion usage between brands for toasted flavored products

Alternative: Different brands have dissimilar promotion usage for toasted flavored products

# Set up data set for regression
vtable(toasted_weekly_promo_analysis)
toasted_weekly_promo_analysis
Name Class Values
brand factor ‘GENERAL MILLS’ ‘KELLOGGS’ ‘POST’
week integer Num: 1 to 52
average_promo_price numeric Num: 2 to 3.99
average_non_promo_price numeric Num: 3.59 to 4.39
average_promo_units numeric Num: 3 to 23
average_non_promo_units numeric Num: 4 to 13.5
promo_store_count integer Num: 4 to 37
no_promo_store_count integer Num: 32 to 81
# Linear regression with linear model
toasted_store_promo_regression <- lm(promo_store_count ~ brand + week, data = toasted_weekly_promo_analysis)
# Review output
summary(toasted_store_promo_regression)
## 
## Call:
## lm(formula = promo_store_count ~ brand + week, data = toasted_weekly_promo_analysis)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -8.232 -2.959 -0.912  2.352 18.423 
## 
## Coefficients:
##               Estimate Std. Error t value             Pr(>|t|)    
## (Intercept)   10.82183    1.02203  10.589 < 0.0000000000000002 ***
## brandKELLOGGS  8.05769    0.90356   8.918   0.0000000000000216 ***
## week          -0.02158    0.03010  -0.717                0.475    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.607 on 101 degrees of freedom
## Multiple R-squared:  0.4421, Adjusted R-squared:  0.4311 
## F-statistic: 40.02 on 2 and 101 DF,  p-value: 0.0000000000001587
  • Small p value so we reject the null that there is no difference in number of stores where promos are ran
par(mfrow = c(1, 1))
plot(toasted_weekly_promo_analysis$promo_store_count, toasted_store_promo_regression$residuals)

# Pull out the coefficients and confidence interval for table and graph
coe <- summary(toasted_store_promo_regression)$coefficients # get coefficients and related stats
coe_CI <- as.data.frame(cbind(coe[-1, ], confint(toasted_store_promo_regression)[-1, ])) # find and bind CI, remove Intercept 

# Rename results data frame
names(coe_CI) <- c("estimate", "se", "t", "pval","low_CI","high_CI") 

htmlTable(round(coe_CI[order(coe_CI$pval, decreasing = FALSE), ], 3))
estimate se t pval low_CI high_CI
brandKELLOGGS 8.058 0.904 8.918 0 6.265 9.85
week -0.022 0.03 -0.717 0.475 -0.081 0.038
# Cleveland dot plot of results
ggplot(coe_CI, aes(x = estimate, y = row.names(coe_CI))) +
  geom_point(size = 3) +
  xlim(min(coe_CI$low_CI), max(coe_CI$high_CI)) +
  ylab("Variable") +
  xlab("Coefficient") +
  theme_bw()+
  geom_segment(aes(yend = reorder(row.names(coe_CI),desc(pval))), 
               xend = coe_CI$high_CI, color = "Blue") +
  geom_segment(aes(yend = reorder(row.names(coe_CI),desc(coe_CI$pval))), 
               xend = coe_CI$low_CI, color = "Blue") +
   xlab("Coefficient with Confidence Interval")+
  geom_vline(xintercept = 0, color = "red")
## Warning: Use of `coe_CI$pval` is discouraged. Use `pval` instead.

  • Kelloggs reliably runs promotions for toasted products in a different number of stores than General Mills

    • We reject the null hypothesis that General Mills has the same number of stores running promotions for toasted products as Kelloggs
    • General Mills runs promotions for toasted products in less stores than Kelloggs at a 99% confidence interval
library("PerformanceAnalytics")
correlation_chart <- toasted_weekly_promo_analysis %>%
  mutate(brand = as.integer(brand)) %>%
  select(brand, promo_store_count) %>% 
  chart.Correlation()

  • results are practically significant

6.4.3 Regular Flavor Pormotion frequency

Null: There is no difference in promotion usage between brands for regular flavored products

Alternative: Different brands have dissimilar promotion usage for regular flavored products

# Set up data set for regression
vtable(regular_weekly_promo_analysis)
regular_weekly_promo_analysis
Name Class Values
brand factor ‘GENERAL MILLS’ ‘KELLOGGS’ ‘POST’
week integer Num: 1 to 52
average_promo_price numeric Num: 2 to 4.67
average_non_promo_price numeric Num: 3.185 to 4.19
average_promo_units numeric Num: 1 to 21
average_non_promo_units numeric Num: 2 to 12
promo_store_count integer Num: 1 to 34
no_promo_store_count integer Num: 9 to 104
# Linear regression with linear model
regular_store_promo_regression <- lm(promo_store_count ~ brand + week, data = regular_weekly_promo_analysis)
# Review output
summary(regular_store_promo_regression)
## 
## Call:
## lm(formula = promo_store_count ~ brand + week, data = regular_weekly_promo_analysis)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10.8851  -2.2786  -0.2662   1.8530  14.2890 
## 
## Coefficients:
##               Estimate Std. Error t value             Pr(>|t|)    
## (Intercept)    5.05065    0.79850   6.325   0.0000000027190049 ***
## brandKELLOGGS 15.48735    0.79231  19.547 < 0.0000000000000002 ***
## brandPOST      6.71812    0.79231   8.479   0.0000000000000192 ***
## week          -0.04352    0.02146  -2.028               0.0443 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.02 on 151 degrees of freedom
## Multiple R-squared:  0.7202, Adjusted R-squared:  0.7146 
## F-statistic: 129.5 on 3 and 151 DF,  p-value: < 0.00000000000000022
  • Small p value so we reject the null that there is no difference in number of stores where promos are ran
par(mfrow = c(1, 1))
plot(regular_weekly_promo_analysis$promo_store_count, regular_store_promo_regression$residuals)

# Pull out the coefficients and confidence interval for table and graph
coe <- summary(regular_store_promo_regression)$coefficients # get coefficients and related stats
coe_CI <- as.data.frame(cbind(coe[-1, ], confint(regular_store_promo_regression)[-1, ])) # find and bind CI, remove Intercept 

# Rename results data frame
names(coe_CI) <- c("estimate", "se", "t", "pval","low_CI","high_CI") 

htmlTable(round(coe_CI[order(coe_CI$pval, decreasing = FALSE), ], 3))
estimate se t pval low_CI high_CI
brandKELLOGGS 15.487 0.792 19.547 0 13.922 17.053
brandPOST 6.718 0.792 8.479 0 5.153 8.284
week -0.044 0.021 -2.028 0.044 -0.086 -0.001
# Cleveland dot plot of results
ggplot(coe_CI, aes(x = estimate, y = row.names(coe_CI))) +
  geom_point(size = 3) +
  xlim(min(coe_CI$low_CI), max(coe_CI$high_CI)) +
  ylab("Variable") +
  xlab("Coefficient") +
  theme_bw()+
  geom_segment(aes(yend = reorder(row.names(coe_CI),desc(pval))), 
               xend = coe_CI$high_CI, color = "Blue") +
  geom_segment(aes(yend = reorder(row.names(coe_CI),desc(coe_CI$pval))), 
               xend = coe_CI$low_CI, color = "Blue") +
   xlab("Coefficient with Confidence Interval")+
  geom_vline(xintercept = 0, color = "red")
## Warning: Use of `coe_CI$pval` is discouraged. Use `pval` instead.

  • Kelloggs reliably runs promotions for regular products in a different number of stores than General Mills

    • We reject the null hypothesis that General Mills has the same number of stores running promotions for regular products as Kelloggs and Post
    • General Mills runs promotions for regular products in less stores than Kelloggs and Post at a 99% confidence interval
library("PerformanceAnalytics")
correlation_chart <- regular_weekly_promo_analysis %>%
  mutate(brand = as.integer(brand)) %>%
  select(brand, promo_store_count) %>% 
  chart.Correlation()

  • results are practically significant

6.5 Professional quality visual for memo

6.5.1 Ad findings

store_ads <- gm_joined_data %>%
  filter(ad != "NONE") %>%
  group_by(brand, iri_key, ad) %>%
  summarise(number_of_ads = n())
## filter: removed 19,333 rows (88%), 2,517 rows remaining
## group_by: 3 grouping variables (brand, iri_key, ad)
## summarise: now 1,747 rows and 4 columns, 2 group variables remaining (brand, iri_key)
ads <- store_ads %>%
  group_by(brand, ad) %>%
  summarise(number_of_stores_with_ads = n(),
            total_ad_weeks_across_stores = sum(number_of_ads))
## group_by: 2 grouping variables (brand, ad)
## summarise: now 6 rows and 4 columns, one group variable remaining (brand)
# Create graph for use in memo
store_ads_final <- ads %>%
  ggplot(mapping = aes(x = reorder(brand, number_of_stores_with_ads), y = number_of_stores_with_ads, fill = ad )) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(title = "Kelloggs Runs the Most Ads In the Most Stores",
       subtitle = "Leading in both small and medium sized ads", y = "Stores", x = "", fill = "Advertisment Size") +
  coord_flip() + 
  theme_classic() +
  theme(axis.line.y = element_blank()) +
  scale_fill_manual(values = c("#CCCCFF", "#336699"), labels = c("Small Ad", "Medium Ad"))

ggsave(filename = "store_ads_final.png", plot = store_ads_final)
## Saving 7 x 5 in image
store_ads_final

promotions <- gm_joined_data %>%
  filter(promo == 1) %>%
  group_by(brand, iri_key) %>%
  summarise(number_of_promo_weeks_per_store = n())
## filter: removed 17,305 rows (79%), 4,545 rows remaining
## group_by: 2 grouping variables (brand, iri_key)
## summarise: now 2,270 rows and 3 columns, one group variable remaining (brand)
promotions <- promotions %>%
  group_by(brand) %>%
  summarise(number_of_stores_with_promos = n(),
            total_promo_weeks_across_stores = sum(number_of_promo_weeks_per_store))
## group_by: one grouping variable (brand)
## summarise: now 3 rows and 3 columns, ungrouped
promo_stores <- promotions %>%
  ggplot(mapping = aes(x = reorder(brand, number_of_stores_with_promos), y = number_of_stores_with_promos, fill = brand )) +
  geom_bar(stat = "identity", position = "dodge",  show.legend = FALSE) +
  labs(title = "Kelloggs runs the most in-store promotions", 
       subtitle = "Leading in number of stores running promos", 
       y = "Stores", x = "") +
  coord_flip()+
  clean_theme +
  theme(axis.line.y = element_blank()) +
  scale_fill_manual(values = c("darkblue", "#CCCCFF", "#336699"))

promo_weeks <- promotions %>%
  ggplot(mapping = aes(x = reorder(brand, total_promo_weeks_across_stores), y = total_promo_weeks_across_stores, fill = brand )) +
  geom_bar(stat = "identity", position = "dodge",  show.legend = FALSE) +
  labs(title = "", 
       subtitle = "Leading in number of weeks across stores running promos",
       y = "Total Promo Weeks", x = "") +
  coord_flip()+
  clean_theme +
  theme(axis.line.y = element_blank()) +
  scale_fill_manual(values = c("darkblue", "#CCCCFF", "#336699"))

promo_final <- promo_stores / promo_weeks

ggsave(filename = "promo_final.png", plot = promo_final)
## Saving 7 x 5 in image
promo_final